# Environment setup ----
# NOTE(review): the working directory is a hard-coded absolute path; every
# relative file name read further down is resolved against it.
setwd("C:/Users/Nicole/Desktop/Spring 21 UROP")
# Package loading. The attach order decides which package masks which when
# two export the same name, so the order is left exactly as written.
library(data.table)
library(dplyr)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(RColorBrewer)
# Second library(data.table) call; the package is already attached above.
library(data.table)
library(magrittr)
library(ggrepel)
library(stringr)
library(plotly)
library(pracma)
library(GeneralizedUmatrix)
library(dendextend)
library(ProjectionBasedClustering)
library(fastICA)
library(ComplexHeatmap)
library(ggplotify)
library(circlize)
library(cowplot)
library(stringi)
library(ggpubr)
library(UpSetR)

# Companion script, sourced by absolute path. plot_clusters_with_heatmap() and
# get.unique.ids2() are called below but not defined in this file; presumably
# they are defined here (TODO confirm).
source('C:/Users/Nicole/Desktop/Spring 21 UROP/050522_PhosphoproteomicsPipeline_companion.R')

# Reference tables ----
# GT1 annotation/phenotype table, extended with one conservation category per
# gene ID.
annotation.phenotype <- read.csv(
  'GT1_Annotation_Phenotype.csv',
  fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE
)
conservations <- select(
  read.csv('GT1_ConservationCategories.csv',
           fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE),
  GT1.ID, type
)
# Keep only the alphanumeric run that follows a "." in `type`.
conservations$type <- stri_extract_first(conservations$type,
                                         regex = '(?<=\\.)[[:alnum:]]*')
annotation.phenotype <- suppressMessages(
  left_join(annotation.phenotype, conservations, by = c("ID" = 'GT1.ID'))
)

# Phyre2 top-hit predictions (joined later on the ME49 gene ID).
phyre2 <- read.csv('Toxo_PhyrePredictions_TopHit.csv', stringsAsFactors = FALSE)

# Ortholog summary table: only the gene ID and the input-ortholog columns are
# kept.
ortholog.conversion <- as.data.frame(
  select(
    read.csv('2019_11_11_GenesByOrthologs_Summary_ME49_to_TGGT1.csv',
             fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE,
             as.is = TRUE),
    'Gene.ID', 'Input.Ortholog.s.'
  )
)

# LOPIT table: gene ID, description and predicted localisation columns.
lopit <- select(
  read.csv('LOPIT_raw_all.csv',
           fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE,
           as.is = TRUE),
  'Gene_ID', 'Description', 'localisation.prediction'
)

# Peptide-group table and per-protein annotation ----
all.data <- read.csv(
  '20210607_CDPK1TimeCourse_TMTPro1_PHOSPHO_ZAP_b_PeptideGroupsAll.csv',
  fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE
)
# Keep the text preceding "-t26_1-p1"; accessions without that suffix yield NA.
all.data$Master.Protein.Accessions <- stri_extract_first(
  all.data$Master.Protein.Accessions,
  regex = '.*(?=-t26_1-p1)'
)
all.data <- suppressMessages(
  left_join(all.data, annotation.phenotype,
            by = c("Master.Protein.Accessions" = 'ID'))
)

# For every accession, take the first Gene.ID whose Input.Ortholog.s. field
# matches it (the accession is used as a regular expression).
# NOTE(review): an accession with no match ends up as the literal string
# "NULL", and an NA accession as the string "NA", rather than a real NA.
all.data$ME49.ID <- vapply(
  all.data$Master.Protein.Accessions,
  function(accession) {
    hits <- grep(accession, ortholog.conversion$Input.Ortholog.s.)
    as.character(as.list(ortholog.conversion$Gene.ID[hits])[1])
  },
  character(1),
  USE.NAMES = FALSE
)
# all.data$ME49.ID <- lapply(all.data$Master.Protein.Accessions, function(x) 
#   ortholog.conversion$Gene.ID[grep(x, ortholog.conversion$Input.Ortholog.s.)] ) %>% as.character()

# Phyre2 predictions are keyed on the ME49 ID, LOPIT on the GT1 accession.
all.data <- left_join(all.data, phyre2, by = c("ME49.ID" = 'ID'))

all.data <- left_join(all.data, lopit, by = c("Master.Protein.Accessions" = 'Gene_ID'))

# Keep only rows whose accession starts with "TGGT1" ----
initialcount0 <- nrow(all.data)
all.data <- all.data[grepl("^TGGT1", all.data$Master.Protein.Accessions), ]
# The printed number is (rows after - rows before), i.e. zero or negative.
print(paste0("Removing any human peptides: ", nrow(all.data) - initialcount0))
# Snapshot reused as the starting point of the later analyses.
initial.data <- all.data

# ***CDPK1-dependent analysis
# *First replicate analysis ----
# Removing any peptide that was not found at every timepoint ----
initialcount1 <- nrow(all.data)
all.data <- subset(all.data, Quan.Info != "No Quan Values")

# A row is kept only if all eight "Sample 1" abundance columns are non-missing.
all.data <- all.data[complete.cases(all.data[, c(
  "Abundance..F1..127N..Sample..1..PBS..0",
  "Abundance..F1..129N..Sample..1..PBS..9",
  "Abundance..F1..131N..Sample..1..PBS..30",
  "Abundance..F1..133N..Sample..1..PBS..300",
  "Abundance..F1..126..Sample..1..IAA..0",
  "Abundance..F1..128C..Sample..1..IAA..9",
  "Abundance..F1..130C..Sample..1..IAA..30",
  "Abundance..F1..132C..Sample..1..IAA..300"
)]), ]
print(paste0("Removing any peptide that was not found at every timepoint: ",
             nrow(all.data) - initialcount1))

# Removing any non phosphopeptides ----
initialcount2 <- nrow(all.data)
# Flag rows whose Modifications text contains "Phospho". grepl() is vectorised
# and returns FALSE for NA, so no per-row loop is needed, and the assignment
# also works when the table has zero rows.
all.data$Phosphorylated <- ifelse(grepl("Phospho", all.data$Modifications), "Yes", "No")
all.data %<>% subset(Phosphorylated == "Yes")

print(paste0("Removing any non-phosphopeptides: ", (nrow(all.data) - initialcount2)))
# Snapshots taken before the L2FC filter: one is extended with AUC columns
# below, the other is kept as the replicate-1 copy.
all.data.unfiltered <- all.data
all.data.unfiltered.R1 <- all.data

# Removing peptides without a high L2FC between PBS 0 and PBS30 ----
initialcount6 <- nrow(all.data)
# Keep rows whose PBS 30 / PBS 0 log2 ratio is above .5 in absolute value;
# rows with a missing ratio are dropped by subset().
all.data <- subset(all.data, abs(Abundance.Ratio..log2....PBS..30.....PBS..0.) > .5)
print(paste0("Removing peptides without a high L2FC between PBS 0 and PBS30: ",
             nrow(all.data) - initialcount6))

# Removing peptides without a large AUC difference between PBS and IAA profile ----
initialcount5 <- nrow(all.data)

# PBS 0 is the reference of every ratio, so its own log2 ratio is set to 0.
all.data.unfiltered[["Abundance.Ratio..log2....PBS..0.....PBS..0."]] <- 0
# The four conditions are placed at x = 1, 2, 3, 4 (equal spacing) for the
# trapezoid rule.
times <- c(1, 2, 3, 4)

# Area under the replicate-1 PBS profile, one value per row.
all.data.unfiltered$AUC.PBS <- apply(
  all.data.unfiltered[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)

# Area under the replicate-1 IAA profile (ratios are relative to PBS 0).
all.data.unfiltered$AUC.IAA <- apply(
  all.data.unfiltered[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
all.data.unfiltered$AUC.diff <- all.data.unfiltered$AUC.PBS - all.data.unfiltered$AUC.IAA

# Same IAA profile, but shifted so that it starts at 0: the IAA 0 value is
# subtracted from the three later values and the first point is fixed at 0.
all.data.unfiltered$AUC.IAA.SSP <- apply(
  all.data.unfiltered[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = c(0, profile[-1] - profile[1]))
)
## Selection method: null distribution = DMSO for AUC diff
dmso.data <- read.csv('20210607_CDPK1TimeCourse_TMTPro1_PHOSPHO_DMSO_b_PeptideGroupsAll.csv',
                      fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)
# Build the TGGT1 mask from the DMSO table's own accessions. The mask used to
# be computed from all.data, whose row count differs from dmso.data, so it was
# recycled or over-long and did not filter the DMSO rows at all.
# Assumes the DMSO export has a Master.Protein.Accessions column like the
# zaprinast export does (TODO confirm).
dmso.data %<>% subset(grepl("^TGGT1", Master.Protein.Accessions))

dmso.data <- dmso.data %>% subset(Quan.Info != "No Quan Values")
# Keep rows with all eight "Sample 1" abundance columns present.
dmso.data <- dmso.data[complete.cases(dmso.data[, c(
  "Abundance..F3..127N..Sample..1..PBS..0",
  "Abundance..F3..129N..Sample..1..PBS..9",
  "Abundance..F3..131N..Sample..1..PBS..30",
  "Abundance..F3..133N..Sample..1..PBS..300",
  "Abundance..F3..126..Sample..1..IAA..0",
  "Abundance..F3..128C..Sample..1..IAA..9",
  "Abundance..F3..130C..Sample..1..IAA..30",
  "Abundance..F3..132C..Sample..1..IAA..300"
)]), ]
# Vectorised phospho flag (grepl() returns FALSE for NA).
dmso.data$Phosphorylated <- ifelse(grepl("Phospho", dmso.data$Modifications), "Yes", "No")
dmso.data %<>% subset(Phosphorylated == "Yes")

# PBS 0 is the reference of every ratio, so its own log2 ratio is 0.
dmso.data$Abundance.Ratio..log2....PBS..0.....PBS..0. <- 0
# The four conditions sit at x = 1, 2, 3, 4 for the trapezoid rule.
times <- c(1,2,3,4)
dmso.data$AUC.PBS <- apply(
  dmso.data[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
dmso.data$AUC.IAA <- apply(
  dmso.data[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
# Null distribution of the PBS - IAA area difference.
dmso.data$AUC.diff <- dmso.data$AUC.PBS - dmso.data$AUC.IAA

## P-value generation
# Standardise each AUC difference against the DMSO null distribution:
#   z = (x - mean(null)) / sd(null)
# The previous parenthesisation computed x - (mean(null) / sd(null)), which
# only shifts the values and never scales them by the null's spread.
all.data.unfiltered$zscore.AUC.ZAP <-
  (all.data.unfiltered$AUC.diff - mean(dmso.data$AUC.diff)) / sd(dmso.data$AUC.diff)
# Two-sided p-value under a standard normal.
all.data.unfiltered$pvalue.AUC.ZAP <- 2*pnorm(abs(all.data.unfiltered$zscore.AUC.ZAP), lower.tail = FALSE)
# Side-by-side diagnostic histograms. par(mfrow) is not restored afterwards,
# so later base-graphics plots inherit the 1 x 2 layout.
par(mfrow=c(1, 2))
hist(all.data.unfiltered$pvalue.AUC.ZAP, main = 'P-value distribution', xlab = "P-value")
hist(all.data.unfiltered$zscore.AUC.ZAP, main = "Z-score distribution", xlab = "Z-score")

# Bring the AUC, z-score and p-value columns onto the L2FC-filtered rows.
# No `by` is given, so the join keys are all columns the two tables share.
all.data <- suppressMessages(left_join(all.data, all.data.unfiltered))

# Order by ascending AUC difference, then keep rows with p < .05.
all.data.ranked <- subset(all.data[order(all.data$AUC.diff), ], pvalue.AUC.ZAP < .05)

print(paste0("Removing peptides without a large AUC difference between PBS and IAA profile: ",
             nrow(all.data.ranked) - initialcount5))

# Saving significant data ----
# fwrite(all.data.ranked, file = "081121_significant_timecourse_data_r1.csv")
# Replicate-1 results: the full scored table and the significant subset.
r1.data.all <- all.data.unfiltered
r1.data <- all.data.ranked

# Clustering ----

# NOTE(review): trim() is not defined in this file; presumably it comes from
# an attached package or the companion script (TODO confirm).
hist(all.data.ranked$pvalue.AUC.ZAP %>% trim())

# Matrix of the eight replicate-1 log2 ratios (IAA then PBS). The stray empty
# first argument that used to precede c(...) in select() has been removed.
clustering.data <- all.data.ranked %>%
  dplyr::select(c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.1.....PBS..0.1.",
                  "Abundance.Ratio..log2....PBS..0.....PBS..0.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.1.....PBS..0.1."))
clustering.data <- data.matrix(clustering.data)
# Fix the RNG kind and seed before the projection so that reruns start from
# the same generator state.
library(setRNG)
setRNG(kind="Wichmann-Hill", seed=c(08,22,2000), normal.kind="Box-Muller")
# Project the rows with NeRV, build the generalized U-matrix on that
# projection, then cluster with k = 3.
projectionpoints <- NeRV(clustering.data)
visualization <- GeneralizedUmatrix(Data = clustering.data,
                                    projectionpoints, PlotIt = TRUE)
LC <- c(visualization$Lines, visualization$Columns)
cluster.vector <- ProjectionBasedClustering(k=3, Data = clustering.data,
                                            BestMatches = visualization$Bestmatches,
                                            LC = LC, PlotIt = TRUE)
## Callouts
# Label rows of TGGT1_289100 with "- Hook" and every other row with a single
# space. vapply() always returns a character vector (sapply() returns an empty
# list for empty input) and keeps the same element names sapply() produced.
all.data.ranked$Callout.Name <- vapply(
  all.data.ranked$Master.Protein.Accessions,
  function(accession) {
    if (accession == "TGGT1_289100") "- Hook" else " "
  },
  character(1)
)

# Cluster line plots plus heatmap; the plotting helper is defined outside this
# file.
grobtosave <- plot_clusters_with_heatmap(all.data.ranked, 
                                         title = "Projection Based Clustering of CDPK1-dependent phosphopeptides (R1)",
                                         cluster.method = 'projection_based', V = cluster.vector)
# ggsave("072721_Clusters_R1.pdf", plot = grobtosave, scale = 2.2)


# *Second replicate analysis ----
# Removing any peptide that was not found at every timepoint ----
# Restart from the TGGT1-only snapshot taken before any replicate filtering.
initialcount1 <- nrow(initial.data)
all.data <- subset(initial.data, Quan.Info != "No Quan Values")
# A row is kept only if all eight "Sample 2" abundance columns are non-missing.
all.data <- all.data[complete.cases(all.data[, c(
  "Abundance..F1..128N..Sample..2..PBS..0",
  "Abundance..F1..130N..Sample..2..PBS..9",
  "Abundance..F1..132N..Sample..2..PBS..30",
  "Abundance..F1..134N..Sample..2..PBS..300",
  "Abundance..F1..127C..Sample..2..IAA..0",
  "Abundance..F1..129C..Sample..2..IAA..9",
  "Abundance..F1..131C..Sample..2..IAA..30",
  "Abundance..F1..133C..Sample..2..IAA..300"
)]), ]
print(paste0("Removing any peptide that was not found at every timepoint: ",
             nrow(all.data) - initialcount1))

# Removing any non phosphopeptides ----
initialcount2 <- nrow(all.data)
# Flag rows whose Modifications text contains "Phospho". grepl() is vectorised
# and returns FALSE for NA, so no per-row loop is needed, and the assignment
# also works when the table has zero rows.
all.data$Phosphorylated <- ifelse(grepl("Phospho", all.data$Modifications), "Yes", "No")
all.data %<>% subset(Phosphorylated == "Yes")

print(paste0("Removing any non-phosphopeptides: ", (nrow(all.data) - initialcount2)))
# Snapshots taken before the L2FC filter: one is extended with AUC columns
# below, the other is kept as the replicate-2 copy.
all.data.unfiltered <- all.data
all.data.unfiltered.R2 <- all.data

# Removing peptides without a high L2FC between PBS 0 and PBS30 ----
initialcount6 <- nrow(all.data)
# Keep rows whose PBS 30 / PBS 0 log2 ratio is above .5 in absolute value;
# rows with a missing ratio are dropped by subset().
all.data <- subset(all.data, abs(Abundance.Ratio..log2....PBS..30.....PBS..0.) > .5)
print(paste0("Removing peptides without a high L2FC between PBS 0 and PBS30: ",
             nrow(all.data) - initialcount6))

# Removing peptides without a large AUC difference between PBS and IAA profile ----
initialcount5 <- nrow(all.data)

# PBS 0 is the reference of every ratio, so its own log2 ratio is set to 0.
all.data.unfiltered[["Abundance.Ratio..log2....PBS..0.....PBS..0."]] <- 0
# The four conditions are placed at x = 1, 2, 3, 4 (equal spacing) for the
# trapezoid rule.
times <- c(1, 2, 3, 4)

# Area under the replicate-2 PBS profile, one value per row.
all.data.unfiltered$AUC.PBS <- apply(
  all.data.unfiltered[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2.")],
  1,
  function(profile) trapz(x = times, y = profile)
)

# Area under the replicate-2 IAA profile (ratios are relative to PBS 0).
all.data.unfiltered$AUC.IAA <- apply(
  all.data.unfiltered[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
all.data.unfiltered$AUC.diff <- all.data.unfiltered$AUC.PBS - all.data.unfiltered$AUC.IAA

# Same IAA profile, but shifted so that it starts at 0: the IAA 0 value is
# subtracted from the three later values and the first point is fixed at 0.
all.data.unfiltered$AUC.IAA.SSP <- apply(
  all.data.unfiltered[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.")],
  1,
  function(profile) trapz(x = times, y = c(0, profile[-1] - profile[1]))
)
## Selection method: null distribution = DMSO for AUC diff
dmso.data <- read.csv('20210607_CDPK1TimeCourse_TMTPro1_PHOSPHO_DMSO_b_PeptideGroupsAll.csv',
                      fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)
# Build the TGGT1 mask from the DMSO table's own accessions. The mask used to
# be computed from all.data, whose row count differs from dmso.data, so it was
# recycled or over-long and did not filter the DMSO rows at all.
# Assumes the DMSO export has a Master.Protein.Accessions column like the
# zaprinast export does (TODO confirm).
dmso.data %<>% subset(grepl("^TGGT1", Master.Protein.Accessions))

dmso.data <- dmso.data %>% subset(Quan.Info != "No Quan Values")
# Keep rows with all eight "Sample 2" abundance columns present.
dmso.data <- dmso.data[complete.cases(dmso.data[, c(
  "Abundance..F3..128N..Sample..2..PBS..0",
  "Abundance..F3..130N..Sample..2..PBS..9",
  "Abundance..F3..132N..Sample..2..PBS..30",
  "Abundance..F3..134N..Sample..2..PBS..300",
  "Abundance..F3..127C..Sample..2..IAA..0",
  "Abundance..F3..129C..Sample..2..IAA..9",
  "Abundance..F3..131C..Sample..2..IAA..30",
  "Abundance..F3..133C..Sample..2..IAA..300"
)]), ]
# Vectorised phospho flag (grepl() returns FALSE for NA).
dmso.data$Phosphorylated <- ifelse(grepl("Phospho", dmso.data$Modifications), "Yes", "No")
dmso.data %<>% subset(Phosphorylated == "Yes")

# PBS 0 is the reference of every ratio, so its own log2 ratio is 0.
dmso.data$Abundance.Ratio..log2....PBS..0.....PBS..0. <- 0
# The four conditions sit at x = 1, 2, 3, 4 for the trapezoid rule.
times <- c(1,2,3,4)
dmso.data$AUC.PBS <- apply(
  dmso.data[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
dmso.data$AUC.IAA <- apply(
  dmso.data[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
# Null distribution of the PBS - IAA area difference.
dmso.data$AUC.diff <- dmso.data$AUC.PBS - dmso.data$AUC.IAA

## P-value generation
# Standardise each AUC difference against the DMSO null distribution:
#   z = (x - mean(null)) / sd(null)
# The previous parenthesisation computed x - (mean(null) / sd(null)), which
# only shifts the values and never scales them by the null's spread.
all.data.unfiltered$zscore.AUC.ZAP <-
  (all.data.unfiltered$AUC.diff - mean(dmso.data$AUC.diff)) / sd(dmso.data$AUC.diff)
# Two-sided p-value under a standard normal.
all.data.unfiltered$pvalue.AUC.ZAP <- 2*pnorm(abs(all.data.unfiltered$zscore.AUC.ZAP), lower.tail = FALSE)
# Side-by-side diagnostic histograms. par(mfrow) is not restored afterwards,
# so later base-graphics plots inherit the 1 x 2 layout.
par(mfrow=c(1, 2))
hist(all.data.unfiltered$pvalue.AUC.ZAP, main = 'P-value distribution', xlab = "P-value")
hist(all.data.unfiltered$zscore.AUC.ZAP, main = "Z-score distribution", xlab = "Z-score")

# Bring the AUC, z-score and p-value columns onto the L2FC-filtered rows.
# No `by` is given, so the join keys are all columns the two tables share.
all.data <- suppressMessages(left_join(all.data, all.data.unfiltered))

# Order by ascending AUC difference, then keep rows with p < .05.
all.data.ranked <- subset(all.data[order(all.data$AUC.diff), ], pvalue.AUC.ZAP < .05)

print(paste0("Removing peptides without a large AUC difference between PBS and IAA profile: ",
             nrow(all.data.ranked) - initialcount5))

# Saving significant data ----
# fwrite(all.data.ranked, file = "081121_significant_timecourse_data_r2.csv")
# Replicate-2 results: the full scored table and the significant subset.
r2.data.all <- all.data.unfiltered
r2.data <- all.data.ranked
# Clustering ----

# NOTE(review): trim() is not defined in this file; presumably it comes from
# an attached package or the companion script (TODO confirm).
hist(all.data.ranked$pvalue.AUC.ZAP %>% trim())

# Matrix of the eight replicate-2 log2 ratios (IAA then PBS). The stray empty
# first argument that used to precede c(...) in select() has been removed.
clustering.data <- all.data.ranked %>%
  dplyr::select(c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.",
                  "Abundance.Ratio..log2....PBS..0.....PBS..0.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2."))
clustering.data <- data.matrix(clustering.data)
# Fix the RNG kind and seed before the projection so that reruns start from
# the same generator state.
library(setRNG)
setRNG(kind="Wichmann-Hill", seed=c(08,22,2000), normal.kind="Box-Muller")
# Project the rows with NeRV, build the generalized U-matrix on that
# projection, then cluster with k = 3.
projectionpoints <- NeRV(clustering.data)
visualization <- GeneralizedUmatrix(Data = clustering.data,
                                    projectionpoints, PlotIt = TRUE)
LC <- c(visualization$Lines, visualization$Columns)
cluster.vector <- ProjectionBasedClustering(k=3, Data = clustering.data,
                                            BestMatches = visualization$Bestmatches,
                                            LC = LC, PlotIt = TRUE)
## Callouts
# Label rows of TGGT1_289100 with "- Hook" and every other row with a single
# space. vapply() always returns a character vector (sapply() returns an empty
# list for empty input) and keeps the same element names sapply() produced.
all.data.ranked$Callout.Name <- vapply(
  all.data.ranked$Master.Protein.Accessions,
  function(accession) {
    if (accession == "TGGT1_289100") "- Hook" else " "
  },
  character(1)
)

# Cluster line plots plus heatmap; the plotting helper is defined outside this
# file.
grobtosave <- plot_clusters_with_heatmap(all.data.ranked, 
                                         title = "Projection Based Clustering of CDPK1-dependent phosphopeptides (R2)",
                                         cluster.method = 'projection_based', V = cluster.vector)
# ggsave("072721_Clusters_R2.pdf", plot = grobtosave, scale = 2.2)

# *Merged replicates analysis ----

# Drop the per-replicate scratch objects before the merged analysis. Only
# names that currently exist are removed: `lineplot.legend` is never assigned
# in this file (presumably a helper creates it; TODO confirm), and rm() warns
# for every name it cannot find.
rm(list = intersect(
  c("all.data", "all.data.ranked", "all.data.unfiltered", "clustering.data",
    "dmso.data", "grobtosave", "lineplot.legend", "projectionpoints",
    "visualization", "cluster.vector", "i", "initialcount0", "initialcount1",
    "initialcount2", "initialcount5", "initialcount6", "LC", "times"),
  ls()
))

# Removing any peptide that was not found at every timepoint ----
# Restart from the TGGT1-only snapshot taken before any replicate filtering.
initialcount1 <- nrow(initial.data)
all.data <- subset(initial.data, Quan.Info != "No Quan Values")

# A row is kept only if all sixteen abundance columns (both samples) are
# non-missing.
all.data <- all.data[complete.cases(all.data[, c(
  "Abundance..F1..127N..Sample..1..PBS..0",
  "Abundance..F1..128N..Sample..2..PBS..0",
  "Abundance..F1..129N..Sample..1..PBS..9",
  "Abundance..F1..130N..Sample..2..PBS..9",
  "Abundance..F1..131N..Sample..1..PBS..30",
  "Abundance..F1..132N..Sample..2..PBS..30",
  "Abundance..F1..133N..Sample..1..PBS..300",
  "Abundance..F1..134N..Sample..2..PBS..300",
  "Abundance..F1..126..Sample..1..IAA..0",
  "Abundance..F1..127C..Sample..2..IAA..0",
  "Abundance..F1..128C..Sample..1..IAA..9",
  "Abundance..F1..129C..Sample..2..IAA..9",
  "Abundance..F1..130C..Sample..1..IAA..30",
  "Abundance..F1..131C..Sample..2..IAA..30",
  "Abundance..F1..132C..Sample..1..IAA..300",
  "Abundance..F1..133C..Sample..2..IAA..300"
)]), ]
print(paste0("Removing any peptide that was not found at every timepoint: ",
             nrow(all.data) - initialcount1))

# Removing any non phosphopeptides ----
initialcount2 <- nrow(all.data)
# Flag rows whose Modifications text contains "Phospho". grepl() is vectorised
# and returns FALSE for NA, so no per-row loop is needed, and the assignment
# also works when the table has zero rows.
all.data$Phosphorylated <- ifelse(grepl("Phospho", all.data$Modifications), "Yes", "No")
all.data %<>% subset(Phosphorylated == "Yes")

print(paste0("Removing any non-phosphopeptides: ", (nrow(all.data) - initialcount2)))
# Phosphopeptides quantified in both samples; used as the left side of the
# merge below.
all.data.unfiltered <- all.data

# Merging ----

# Attach the replicate-1 and replicate-2 scored tables to the peptides that
# passed the two-sample filter above.
# The first join has no `by`, so dplyr joins on every column name the two
# tables share. The second join uses the first (257-5) column names of r1.data
# as keys; columns present in both inputs but outside the keys get dplyr's
# ".x" / ".y" suffixes, which the select() below relies on.
# NOTE(review): `257-5` is a hard-coded column count; it silently selects the
# wrong key set if any upstream table gains or loses a column.
r1r2.data.all <- left_join(all.data.unfiltered, r1.data.all) %>% suppressMessages() %>%
  left_join(r2.data.all, by = (colnames(r1.data) %>% head(257-5))) %>% suppressMessages()
# The Description column is overwritten with the literal string 'na'.
r1r2.data.all$Description <- 'na'
# Keep and rename only the accession, description and per-replicate AUCs
# (.x columns are renamed to R1, .y columns to R2).
r1r2.data.all %<>% select("NAME" = "Master.Protein.Accessions",
                          "Description",
                          "AUC_PBS_R1"= "AUC.PBS.x",
                          "AUC_PBS_R2"= "AUC.PBS.y",
                          "AUC_IAA_R1"= "AUC.IAA.x",
                          "AUC_IAA_R2"= "AUC.IAA.y")
# gsea.summary <- r1r2.data.all %>% group_by(NAME, Description) %>%
#   summarise(AUC_PBS_R1 = max(AUC_PBS_R1),
#             AUC_PBS_R2 = max(AUC_PBS_R2), 
#             AUC_IAA_SSP_R1 = max(AUC_IAA_SSP_R1),
#             AUC_IAA_SSP_R2 = max(AUC_IAA_SSP_R2))

# fwrite(gsea.summary, "all_data_for_gsea_summary-MAX_AUC.csv")
# fwrite(r1r2.data.all, file = "Processed_Zap_phosphopeptides-AUCs.csv")

# Peptides significant in both replicates: inner join of the two significant
# tables on the same leading-column key set as above.
merged.data <- inner_join(r1.data, r2.data, by = (colnames(r1.data) %>% head(257-5)))
# To save go to "Merged AUCs: Two kinds", run it, and go to the end of the section
# Clustering ----

# NOTE(review): if pvalue.AUC.ZAP is not one of the inner-join key columns,
# merged.data only holds pvalue.AUC.ZAP.x / pvalue.AUC.ZAP.y and this
# expression yields NULL; confirm which column is intended.
# NOTE(review): trim() is not defined in this file (TODO confirm its source).
hist(merged.data$pvalue.AUC.ZAP %>% trim())

# Matrix of the eight log2 ratios used for clustering. Only the replicate-2
# ratio columns (plus the PBS 0 zero column) are used here. The stray empty
# first argument that used to precede c(...) in select() has been removed.
clustering.data <- merged.data %>%
  dplyr::select(c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.",
                  "Abundance.Ratio..log2....PBS..0.....PBS..0.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2."))
clustering.data <- data.matrix(clustering.data)
# Fix the RNG kind and seed before the projection so that reruns start from
# the same generator state.
library(setRNG)
setRNG(kind="Wichmann-Hill", seed=c(08,22,2000), normal.kind="Box-Muller")
# Project the rows with NeRV, build the generalized U-matrix on that
# projection, then cluster with k = 3.
projectionpoints <- NeRV(clustering.data)
visualization <- GeneralizedUmatrix(Data = clustering.data,
                                    projectionpoints, PlotIt = TRUE)
LC <- c(visualization$Lines, visualization$Columns)
cluster.vector <- ProjectionBasedClustering(k=3, Data = clustering.data,
                                            BestMatches = visualization$Bestmatches,
                                            LC = LC, PlotIt = TRUE)
## Callouts
# Label rows of TGGT1_289100 with "- Hook" and every other row with a single
# space. vapply() always returns a character vector (sapply() returns an empty
# list for empty input) and keeps the same element names sapply() produced.
merged.data$Callout.Name <- vapply(
  merged.data$Master.Protein.Accessions,
  function(accession) {
    if (accession == "TGGT1_289100") "- Hook" else " "
  },
  character(1)
)

# Cluster line plots plus heatmap; the plotting helper is defined outside this
# file.
grobtosave <- plot_clusters_with_heatmap(merged.data, 
                                         title = "Projection Based Clustering of CDPK1-dependent phosphopeptides (merged)",
                                         cluster.method = 'projection_based', V = cluster.vector)
# ggsave("071421_Clusters_merged.pdf", plot = grobtosave, scale = 2.2)



## May 2022: Making final edits ----
## 

## Outputting all zaprinast peptides with AUCs and with CDPK1-dep peptides marked
# all.data.unfiltered currently has all the data,
# merged.data has all the significant ones

# get.unique.ids2() is defined outside this file; it is called once per table
# to build the ID column that the two tables are compared on.
merged.data$unique.id <- get.unique.ids2(merged.data)
all.data.unfiltered$unique.id <- get.unique.ids2(all.data.unfiltered)
# TRUE for every peptide whose ID also appears in the significant set.
# `%in%` is already vectorised, so the per-element sapply() wrapper (one full
# membership scan per row) is not needed; the result is a plain logical vector.
all.data.unfiltered$cdpk1.dep.significant <-
  all.data.unfiltered$unique.id %in% merged.data$unique.id
# fwrite(all.data.unfiltered, file = "Processed_Zap_Phosphopeptides-CDPK1-dep-marked.csv")

## Non CDPK1-dep clustering (5/17/22)
## Read the annotated peptide table from CSV and keep the CDPK1-independent
## peptides: both PBS AUC p-values below 0.05 and not flagged as CDPK1-dependent.

cdpk1.independent.data <- fread('2022_05_17_NH_Processed_Zap_Phosphopeptides-CDPK1-dep-marked_AWC_AUCpbspvalues.csv')
cdpk1.independent.data <- cdpk1.independent.data %>%
  filter(pvalue_AUC_PBS_R1 < 0.05,
         pvalue_AUC_PBS_R2 < 0.05,
         cdpk1.dep.significant == FALSE)
# The PBS 0 vs PBS 0 ratio is set to zero for every peptide.
cdpk1.independent.data$Abundance.Ratio..log2....PBS..0.....PBS..0. <- 0

# Projection-based clustering of the CDPK1-independent peptides.
# The eight ratio columns below form the clustering matrix. The stray empty
# first argument of the original `select(, c(...))` call has been removed.
clustering.data <- cdpk1.independent.data %>%
  dplyr::select(c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.",
                  "Abundance.Ratio..log2....PBS..0.....PBS..0.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2."))
clustering.data <- data.matrix(clustering.data)
library(setRNG)
# Fix the RNG kind and seed before projecting.
setRNG(kind = "Wichmann-Hill", seed = c(08, 22, 2000), normal.kind = "Box-Muller")
projectionpoints <- NeRV(clustering.data)
visualization <- GeneralizedUmatrix(Data = clustering.data,
                                    projectionpoints, PlotIt = TRUE)
# Lines / Columns as reported by the U-matrix result
LC <- c(visualization$Lines, visualization$Columns)
cluster.vector <- ProjectionBasedClustering(k = 3, Data = clustering.data,
                                            BestMatches = visualization$Bestmatches,
                                            LC = LC, PlotIt = TRUE)
## Callouts
# Heatmap markers for the CDPK1-independent peptides: "- Hook" for
# TGGT1_289100, a single space otherwise. Vectorised with %in% so an NA
# accession yields the placeholder rather than an error.
cdpk1.independent.data$Callout.Name <- ifelse(
  cdpk1.independent.data$Master.Protein.Accessions %in% "TGGT1_289100",
  "- Hook",
  " "
)

# add on cdpk1-dependent data
# The replicate-suffixed AUC columns of merged.data are renamed to the
# *_R1 / *_R2 names used by the independent table.
cdpk1.dependent.data <- merged.data %>% rename("AUC_PBS_R1" = "AUC.PBS.x",
                                               "AUC_PBS_R2" = "AUC.PBS.y",
                                               "AUC_IAA_R1" = "AUC.IAA.x",
                                               "AUC_IAA_R2" = "AUC.IAA.y")

# Same marker scheme for the dependent peptides, except that the placeholder
# here is " 0 ".
cdpk1.dependent.data$Callout.Name <- ifelse(
  cdpk1.dependent.data$Master.Protein.Accessions %in% "TGGT1_289100",
  "- Hook",
  " 0 "
)
# All CDPK1-dependent peptides go into cluster 0.
cdpk1.dependent.data$cluster <- 0

# Combine the CDPK1-dependent (cluster 0) and CDPK1-independent peptides.

cdpk1.independent.data$cluster <- cluster.vector
cdpk1.all.data <- rbind(cdpk1.dependent.data, cdpk1.independent.data, fill = TRUE)
cdpk1.all.data <- cdpk1.all.data[order(cdpk1.all.data$cluster,
                                       cdpk1.all.data$Abundance.Ratio..log2....PBS..9.....PBS..0.), ]

# Shift every cluster label up by one, then swap labels 2 and 3 so that the
# heatmap is ordered as intended. Any other label passes through unchanged.
cdpk1.all.data$cluster <- local({
  shifted <- cdpk1.all.data$cluster + 1
  ifelse(shifted == 2, 3,
         ifelse(shifted == 3, 2, shifted))
})

# Final row order: by cluster, then by the AUC expression below.
# NOTE(review): as written the sort key is (-AUC_PBS_R1 + AUC_PBS_R2 / 2). If a
# descending mean AUC was intended it would need -(R1 + R2) / 2. Left unchanged
# because the hard-coded callout ids that follow may depend on the current
# order -- confirm before changing.
cdpk1.all.data <- cdpk1.all.data[order(cdpk1.all.data$cluster,
                                       (-cdpk1.all.data$AUC_PBS_R1+cdpk1.all.data$AUC_PBS_R2/2)), ]
cdpk1.all.data$unique.id <- get.unique.ids2(cdpk1.all.data)

# Label specific peptides (identified by unique id) with "<n> start ---" /
# "<n> end ---" markers; every other peptide gets an empty string.
# Presumably these ids are the first and last rows of each cluster in the
# current sort order -- verify if the ordering ever changes.
# A named lookup vector replaces the if/else chain: matching is exact, an NA id
# maps to '' instead of raising an error, and the result is unnamed.
cdpk1.all.data$Callout.Name <- local({
  callout.labels <- c(
    "1.263APSAVFHPSTSAPAICFPKTGGT1_249390" = "1 start ---",
    "0.915ASMAPSGSPGSENDAEEKTGGT1_320100" = "1 end ---",
    "4.419LATQGVRTGGT1_240300" = "2 start ---",
    "1.238LISLAELHPRTGGT1_286500" = "2 end ---",
    "1.321GAEHGASLRTGGT1_287980" = "3 start ---",
    "0.893SQDGLPVSSSVASLKPGSESARTGGT1_246990" = "3 end ---",
    "1.021NSPSVSAPSTPARTGGT1_250820" = "4 start ---",
    "1.352AATPVRDRSPHPTSEAPKTGGT1_230940" = "4 end ---"
  )
  labels <- unname(callout.labels[as.character(cdpk1.all.data$unique.id)])
  labels[is.na(labels)] <- ''
  labels
})

# Write the combined, cluster-annotated peptide table to disk.
fwrite(cdpk1.all.data, file = "Processed_Zap_phosphopeptides-with-clusters.csv")

# Cluster profile + heatmap figure for the combined table; the cluster labels
# are passed as integers.
grobtosave <- plot_clusters_with_heatmap(
  cdpk1.all.data,
  title = "Projection Based Clustering of CDPK1-de+in dependent phosphopeptides",
  cluster.method = 'projection_based',
  V = as.integer(cdpk1.all.data$cluster)
)

ggsave("051722_clusters.pdf", plot = grobtosave, scale = 3)

# heatmaps
# Warning: these limits on the scales may cause a few peptides to be cut off.

# The colour-scale limits are the overall minimum and maximum across the eight
# merged log2 ratio columns.
l2fc.values <- cdpk1.all.data %>%
  select('Abundance.Ratio..log2....PBS..0.....PBS..0.',
         'Abundance.Ratio..log2....PBS..9.....PBS..0.',
         'Abundance.Ratio..log2....PBS..30.....PBS..0.',
         'Abundance.Ratio..log2....PBS..300.....PBS..0.',
         'Abundance.Ratio..log2....IAA..0.....PBS..0.',
         'Abundance.Ratio..log2....IAA..9.....PBS..0.',
         'Abundance.Ratio..log2....IAA..30.....PBS..0.',
         'Abundance.Ratio..log2....IAA..300.....PBS..0.')
min.l2fc <- min(l2fc.values)
max.l2fc <- max(l2fc.values)

grobtosave2 <- grid.arrange(
  plot.peptide.heatmaps(cdpk1.all.data, min.l2fc = min.l2fc, max.l2fc = max.l2fc)
)

ggsave("051722_heatmaps.pdf", plot = grobtosave2, scale = 3)


# Overlap with thioP 5/17/22 ----

# Cross referencing thioP and timecourse hit datasets ----
timecourse.data <- merged.data
thioP.data.all <- read.csv("2022_05_17_ThioP_DFelu.csv",
                           stringsAsFactors = FALSE, check.names = TRUE)
# thioP hits: score above 3 and flagged as phospho
thioP.data <- thioP.data.all %>%
  filter(pvaluexl2fc.norm.abund.ratio > 3,
         phospho == "TRUE")

# First ortholog Gene.ID whose Input.Ortholog.s. entry matches the accession.
# With no match the empty list element is coerced by as.character().
thioP.data$ME49.ID <- as.character(
  lapply(thioP.data$Master.Protein.Accessions, function(accession) {
    ortholog.rows <- grep(accession, ortholog.conversion$Input.Ortholog.s.)
    me49.ids <- as.list(ortholog.conversion$Gene.ID[ortholog.rows])
    as.character(me49.ids[1])
  })
)
thioP.data <- thioP.data %>%
  left_join(phyre2, by = c("ME49.ID" = 'ID')) %>%
  left_join(conservations, by = c("Master.Protein.Accessions" = "GT1.ID"))
timecourse.protein.data <- read.csv('20210607_CDPK1TimeCourse_TMTPro1_WP_ZAP_b_ProteinGroupsAll.csv',
                                    stringsAsFactors = FALSE, check.names = TRUE) %>%
  select('Accession', 'Sequence')

# Remove the "-t26_1-p1" suffix from every column. gsub() returns character, so
# every column of the rebuilt data frames is character afterwards.
strip.transcript.suffix <- function(df) {
  data.frame(lapply(df, function(column) {
    gsub("-t26_1-p1", "", column)
  }), stringsAsFactors = FALSE)
}
timecourse.data <- strip.transcript.suffix(timecourse.data)
thioP.data <- strip.transcript.suffix(thioP.data)
timecourse.protein.data <- strip.transcript.suffix(timecourse.protein.data)

# List column: protein sequence(s) whose accession matches; character(0) if none
timecourse.data$Protein.Sequence <- lapply(
  timecourse.data$Master.Protein.Accessions,
  function(accession) {
    timecourse.protein.data$Sequence[grep(accession, timecourse.protein.data$Accession)]
  }
)

## Drop peptides that could originate from multiple proteins, i.e. whose
## accession field contains ';' (very few of these).
thioP.data <- subset(thioP.data, !grepl(';', Master.Protein.Accessions))
timecourse.data <- add.back.modifications(
  subset(timecourse.data, !grepl(';', Master.Protein.Accessions))
)

# Split the thioP table by phosphorylation count.
non.phos.thioP.data <- subset(thioP.data, No.Phospho == 0)
thioP.data <- subset(thioP.data, No.Phospho != 0)

# Add an explicit row index as the first column of both tables.
thioP.data <- tibble::rowid_to_column(thioP.data, "Index")
timecourse.data <- tibble::rowid_to_column(timecourse.data, "Index")

# List column: the character preceding "xPhospho" in Modifications, as a number.
timecourse.data$No.Phospho <- lapply(timecourse.data$Modifications, function(mods) {
  phospho.token <- str_extract(mods, "\\s.xPhospho")
  as.numeric(str_sub(phospho.token, start = 2, end = 2))
})

thioP.data$Unique.ID <- get.unique.ids(thioP.data)
timecourse.data$Unique.ID <- get.unique.ids(timecourse.data)

thioP.data$phos.site.in.master <- retrieve.all.possible.site.numbers.from.df(thioP.data)
timecourse.data$phos.site.in.master <- retrieve.all.possible.site.numbers.from.df(timecourse.data)

# Empty 10-column table that will hold one row per thioP / timecourse match.
# NOTE(review): match.nt.and.phos.site() is defined outside this section;
# presumably it appends rows to this global `hits` table as a side effect
# (its return value is discarded below) -- confirm in the companion script.
hits <- data.frame(matrix(ncol=10,nrow=0, dimnames=list(NULL, c("thioP.index", 
                                                                "timecourse.index", 
                                                                "gene.id", 
                                                                "phos.site.thioP", 
                                                                'phos.site.timecourse', 
                                                                'unique.id.thioP',
                                                                'unique.id.timecourse',
                                                                'distance',
                                                                'All.Mods.thioP',
                                                                'All.Mods.timecourse'))))
# Largest nchar() over the Protein.Sequence list column; passed as `threshold`.
max.protein.length.timecourse <- max(nchar(timecourse.data$Protein.Sequence))
# Call the matcher once per thioP row. apply() turns each row into a character
# vector, so the helper receives every field as text.
invisible(apply(thioP.data[,c('phos.site.in.master',
                              'Master.Protein.Accessions',
                              'No.Phospho',
                              'Index',
                              'Unique.ID',
                              'Modifications.in.Master.Proteins..all.Sites.')], 1,
                function(y) match.nt.and.phos.site(y['phos.site.in.master'],
                                                   y['Master.Protein.Accessions'],
                                                   y['No.Phospho'],
                                                   y['Index'],
                                                   y['Unique.ID'],
                                                   y['Modifications.in.Master.Proteins..all.Sites.'],
                                                   threshold = max.protein.length.timecourse)))

## Override one aspect of the cross-reference function: report a distance of 0
## whenever the thioP and timecourse site lists share at least one site number
## (i.e. an exact match is possible).
# seq_len() keeps the loop from running when `hits` has no rows; `1:nrow(hits)`
# would iterate over c(1, 0) in that case.
for (i in seq_len(nrow(hits))) {
  common.hits <- intersect(as.numeric(unlist(hits$phos.site.thioP[i])),
                           as.numeric(unlist(hits$phos.site.timecourse[i])))
  if (length(common.hits) != 0) {
    hits$distance[i] <- 0
  }
}
hits$distance %<>% as.numeric()


# Adding hit distances columns ----

## Build per-peptide tables holding the gene id and all hit distances.
hits$unique.id.timecourse %<>% unlist()
hits$gene.id %<>% unlist()
hits$All.Mods.thioP %<>% unlist()
hits$unique.id.thioP %<>% unlist()
hits$All.Mods.timecourse %<>% unlist()

num.proteins <- hits %>% select(gene.id) %>% distinct() %>% nrow()

gene.hits <- hits %>% select(gene.id, unique.id.thioP, All.Mods.thioP) %>% distinct()
gene.hits.TC <- hits %>% select(gene.id, unique.id.timecourse, All.Mods.timecourse) %>% distinct()

# For every row of `keys`, collect the distances of all rows in `hit.table`
# that share its gene id, unique id (`id.col`) and modification string
# (`mods.col`). Returns `keys` with two added columns:
#   distance         - the matching distances pasted together with '; '
#   min.hit.distance - their minimum (Inf, with a warning, if nothing matches)
# Comparisons involving NA never match, because which() drops them.
# Safe for zero-row input, unlike a `1:nrow()` loop.
summarise.hit.distances <- function(keys, hit.table, id.col, mods.col) {
  distances.per.key <- lapply(seq_len(nrow(keys)), function(i) {
    same.key <- which(hit.table$gene.id == keys$gene.id[i] &
                        hit.table[[id.col]] == keys[[id.col]][i] &
                        hit.table[[mods.col]] == keys[[mods.col]][i])
    as.numeric(hit.table$distance[same.key])
  })
  keys$distance <- vapply(distances.per.key, paste, character(1), collapse = '; ')
  keys$min.hit.distance <- vapply(distances.per.key, min, numeric(1))
  keys
}

gene.hits <- summarise.hit.distances(gene.hits, hits,
                                     id.col = "unique.id.thioP",
                                     mods.col = "All.Mods.thioP")
gene.hits.TC <- summarise.hit.distances(gene.hits.TC, hits,
                                        id.col = "unique.id.timecourse",
                                        mods.col = "All.Mods.timecourse")

## Convert the thioP columns needed for plotting back to numeric.

numeric.columns <- c("pvalue", "log2.Avg.Norm.AbundRatio", "No.Phospho")
for (column in numeric.columns) {
  thioP.data[[column]] <- as.numeric(thioP.data[[column]])
}
for (column in numeric.columns) {
  non.phos.thioP.data[[column]] <- as.numeric(non.phos.thioP.data[[column]])
}
# Non-phospho peptides carry no hit information.
non.phos.thioP.data$distance <- NA
non.phos.thioP.data$hit <- NA


# Saving hit data ----

# Hit class per thioP peptide, evaluated in order:
#   1 - minimum hit distance is exactly 0
#   2 - minimum hit distance is below 50
#   3 - the peptide's Prot.ID occurs among the timecourse accessions
#   4 - none of the above
# An NA distance satisfies neither distance test and falls through to 3 / 4.
# Computed column-wise rather than via apply() over data-frame rows, which
# would push the numeric columns through as.matrix() / format() first.
# NOTE(review): min.hit.distance and Prot.ID are not created on thioP.data in
# this section -- presumably they come from the input CSV or a helper; confirm.
thioP.data$hit.class <- local({
  min.hit.dist <- as.numeric(thioP.data[["min.hit.distance"]])
  in.timecourse <- thioP.data[["Prot.ID"]] %in% timecourse.data$Master.Protein.Accessions
  case_when(min.hit.dist == 0 ~ 1,
            min.hit.dist < 50 ~ 2,
            in.timecourse ~ 3,
            TRUE ~ 4)
})

# Timecourse side: 1 = distance 0, 2 = distance below 50, 3 = anything else
# (including NA or Inf).
gene.hits.TC$hit.class <- case_when(gene.hits.TC$min.hit.distance == 0 ~ 1,
                                    gene.hits.TC$min.hit.distance < 50 ~ 2,
                                    TRUE ~ 3)

# Attach the hit classes derived from the thioP comparison to the timecourse
# peptides, matching on the unique peptide id.
timecourse.data.w.classes <- left_join(
  timecourse.data,
  gene.hits.TC %>% select(hit.class, unique.id.timecourse),
  by = c('Unique.ID' = "unique.id.timecourse")
)

# Peptides without a matching hit get class 5.
timecourse.data.w.classes$hit.class <- ifelse(
  is.na(timecourse.data.w.classes$hit.class),
  5,
  timecourse.data.w.classes$hit.class
)
timecourse.data.w.classes$unique.id <- get.unique.ids2(timecourse.data.w.classes)
all.data.unfiltered$unique.id <- get.unique.ids2(all.data.unfiltered)
# Carry the classes over to the full (unfiltered) peptide table and save it.
all.timecourse.data.with.classes <- left_join(
  all.data.unfiltered,
  timecourse.data.w.classes %>% select(hit.class, unique.id),
  by = 'unique.id'
)
fwrite(all.timecourse.data.with.classes,
       file = "Processed_Zap_Phosphopeptides-CDPK1-dep-marked-with-hit-classes.csv")

# Merge the hit annotations back onto the complete thioP table.
thioP.data.all$Unique.ID.thiop <- get.unique.ids.thioP(thioP.data.all)
thioP.data$Unique.ID.thiop <- get.unique.ids.thioP(thioP.data)
thioP.data.all2 <- left_join(
  thioP.data.all,
  select(thioP.data, phos.site.in.master, distance, min.hit.distance, hit.class, Unique.ID.thiop),
  by = 'Unique.ID.thiop'
)
# Every column is converted to character before writing.
fwrite(apply(thioP.data.all2, 2, as.character), file = "Processed_DF-elu-with-hit-classes.csv")

# Whole proteome depletion ---- 
# *******
# Read the whole-proteome protein groups, strip the "-t26_1-p1" suffix from the
# accessions and keep only accessions starting with "TGGT1".
timecourse.protein.data.zap <- read.csv('20210607_CDPK1TimeCourse_TMTPro1_WP_ZAP_b_ProteinGroupsAll.csv',
                                        stringsAsFactors = FALSE, check.names = TRUE)
timecourse.protein.data.zap$Accession %<>% stri_extract_first(regex = '.*(?=-t26_1-p1)')
timecourse.protein.data.zap %<>% subset(grepl("^TGGT1", Accession))

# Scatter of grouped abundances at 300 s (PBS on x, IAA on y, both log10 axes).
# TGGT1_301440 is drawn again in a second colour and labelled "CDPK1".
ggplot(timecourse.protein.data.zap,
       aes(x = Abundances..Grouped...PBS..300,
           y = Abundances..Grouped...IAA..300)) +
  geom_point(stroke = 0, size = 2, color = cbPalette[1], shape = 16) +
  geom_point(data = timecourse.protein.data.zap %>% filter(Accession == "TGGT1_301440"),
             color = cbPalette[3], alpha = 1, size = 2, stroke = 0) +
  ylab("Abundance: IAA 300s") +
  xlab("Abundance: PBS 300s") +
  ggtitle("Whole Proteome Depletion") +
  scale_y_log10() +
  scale_x_log10() +
  aliceTheme +
  theme(aspect.ratio = 1) +
  # `shape` is not a text parameter, so it is no longer passed here
  # (it only triggered an "unknown parameters" warning).
  geom_text_repel(data = filter(timecourse.protein.data.zap, Accession == "TGGT1_301440"),
                  aes(x = Abundances..Grouped...PBS..300,
                      y = Abundances..Grouped...IAA..300,
                      label = "CDPK1"), color = "black")
# ggsave("080921_wp-depletion.pdf", scale = 1)

# AUC difference plots ----
# Relies on running lines 1-472
# $$$FIX$$$
# (Re)compute the unique peptide id on every table used below.
r1.data$unique.id <- get.unique.ids2(r1.data)
r2.data$unique.id <- get.unique.ids2(r2.data)
merged.data$unique.id <- get.unique.ids2(merged.data)
r1.data.all$unique.id <- get.unique.ids2(r1.data.all)
r2.data.all$unique.id <- get.unique.ids2(r2.data.all)
all.data$unique.id <- get.unique.ids2(all.data)
# all.data.unfiltered.R1$unique.id <- get.unique.ids2(all.data.unfiltered.R1)
# all.data.unfiltered.R2$unique.id <- get.unique.ids2(all.data.unfiltered.R2)

# Give each replicate's significant set the AUC difference seen in the other
# replicate.
r1.data <- r1.data %>%
  left_join(select(r2.data.all, AUC.diff.otherRep = AUC.diff, unique.id),
            by = 'unique.id')
r2.data <- r2.data %>%
  left_join(select(r1.data.all, AUC.diff.otherRep = AUC.diff, unique.id),
            by = 'unique.id')
# The merged set gets both replicates' AUC differences. These joins use the
# columns the tables have in common (no explicit `by`).
merged.data <- merged.data %>%
  left_join(select(r1.data.all, AUC.diff.R1 = AUC.diff, unique.id)) %>%
  left_join(select(r2.data.all, AUC.diff.R2 = AUC.diff, unique.id))

# Peptides measured in both replicates but present in none of the significant
# sets (merged, R1-only, R2-only).
significant.in.neither.data <- inner_join(
  select(r1.data.all, AUC.diff.R1 = AUC.diff, unique.id),
  select(r2.data.all, AUC.diff.R2 = AUC.diff, unique.id),
  by = 'unique.id'
) %>%
  anti_join(merged.data, by = 'unique.id') %>%
  anti_join(r1.data, by = 'unique.id') %>%
  anti_join(r2.data, by = 'unique.id')

# NOTE(review): the original select() named the PBS 30 ratio column twice; only
# that column is used by the filter below. Check whether a second column
# was intended.
significant.in.neither.data <- significant.in.neither.data %>%
  left_join(select(all.data, Abundance.Ratio..log2....PBS..30.....PBS..0., unique.id))

# Keep peptides whose PBS 30 log2 ratio is above 0.5 or below -0.5.
significant.in.neither.data <- subset(
  significant.in.neither.data,
  Abundance.Ratio..log2....PBS..30.....PBS..0. > .5 | Abundance.Ratio..log2....PBS..30.....PBS..0. < -.5
)

# Scatter of AUC difference in R2 (x) against R1 (y), one layer per
# significance group. Colour is only *mapped* to the group label; the palette
# comes from scale_color_manual(). Previously each layer also set a constant
# `color =`, which overrides the mapping and kept those groups out of the
# legend. The point colours are unchanged.
# The surrounding parentheses keep the plot visible at top level, as before.
(auc.diff.scatter <- ggplot(significant.in.neither.data,
                            aes(x = AUC.diff.R2,
                                y = AUC.diff.R1, color = 'Significant in neither')) +
    geom_point(size = 2, shape = 16) +
    geom_point(data = r2.data, aes(x = AUC.diff,
                                   y = AUC.diff.otherRep, color = 'Significant in R2'),
               size = 2, shape = 16) +
    geom_point(data = r1.data, aes(x = AUC.diff.otherRep,
                                   y = AUC.diff, color = 'Significant in R1'),
               size = 2, shape = 16) +
    geom_point(data = merged.data, aes(x = AUC.diff.R2,
                                       y = AUC.diff.R1, color = 'Significant in both'),
               size = 2, shape = 16) +
    ylab("AUC Difference (R1)") +
    xlab("AUC Difference (R2)") +
    ggtitle("AUC Difference across replicates") +
    aliceTheme +
    scale_color_manual(values = c('Significant in neither' = cbPalette[1],
                                  'Significant in R1' = cbPalette[2],
                                  'Significant in R2' = cbPalette[3],
                                  'Significant in both' = cbPalette[8])) +
    theme(aspect.ratio = 1))

# Save this specific plot rather than whatever the last plot happens to be.
ggsave("050522_AUC_difference_Replicate_comparison_scatter_Zapdata.pdf",
       plot = auc.diff.scatter, scale = 1)

# Upset plots ----
# One row per unique peptide id seen in either replicate, with four 0/1
# membership flags: present in R1 / R2 (all peptides) and present in the
# R1 / R2 significant sets.
all.data.unfiltered.R1$unique.id <- get.unique.ids2(all.data.unfiltered.R1)
all.data.unfiltered.R2$unique.id <- get.unique.ids2(all.data.unfiltered.R2)
all.data.unfiltered.for.upset <- full_join(all.data.unfiltered.R1 %>% select(unique.id),
                                           all.data.unfiltered.R2 %>% select(unique.id),
                                           by = "unique.id")

# Vectorised membership tests replace the row-by-row loop; they also behave
# correctly when the table is empty. as.numeric() keeps the flags as 0 / 1
# doubles, as before.
all.data.unfiltered.for.upset$in.r1.all <-
  as.numeric(all.data.unfiltered.for.upset$unique.id %in% all.data.unfiltered.R1$unique.id)
all.data.unfiltered.for.upset$in.r2.all <-
  as.numeric(all.data.unfiltered.for.upset$unique.id %in% all.data.unfiltered.R2$unique.id)
all.data.unfiltered.for.upset$in.r1.sig <-
  as.numeric(all.data.unfiltered.for.upset$unique.id %in% r1.data$unique.id)
all.data.unfiltered.for.upset$in.r2.sig <-
  as.numeric(all.data.unfiltered.for.upset$unique.id %in% r2.data$unique.id)

# Convert in place to a data.table, keeping the row names as 'Unique_Peptide'.
setDT(all.data.unfiltered.for.upset, keep.rownames = 'Unique_Peptide')[] %>% suppressMessages()
pdf(file="upset_zaprinast_dataset.pdf") 
upset(all.data.unfiltered.for.upset, nsets = 4, 
      mainbar.y.label = "# peptides", sets.x.label = "Exists in:")
grid.text("Peptides in ... in Zaprinast dataset",x = 0.65, y=0.95, gp=gpar(fontsize=20))
dev.off()

fwrite(all.data.unfiltered.for.upset, file = "upset_data_zaprinast_dataset.csv")



# Merged AUCs: Two kinds ----
all.data.unfiltered$Abundance.Ratio..log2....PBS..0.....PBS..0. <- 0
# The four timepoints are treated as equally spaced positions 1..4.
times <- c(1, 2, 3, 4)

## First: unaltered, normal AUC difference.
## Each row's four ratios (in the column order listed) are integrated with
## trapz() over `times`.
all.data.unfiltered$AUC.PBS.unaltered <- apply(
  all.data.unfiltered[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                          "Abundance.Ratio..log2....PBS..9.....PBS..0.",
                          "Abundance.Ratio..log2....PBS..30.....PBS..0.",
                          "Abundance.Ratio..log2....PBS..300.....PBS..0.")],
  1,
  function(ratios) trapz(x = times, y = ratios)
)
all.data.unfiltered$AUC.IAA.unaltered <- apply(
  all.data.unfiltered[, c("Abundance.Ratio..log2....IAA..0.....PBS..0.",
                          "Abundance.Ratio..log2....IAA..9.....PBS..0.",
                          "Abundance.Ratio..log2....IAA..30.....PBS..0.",
                          "Abundance.Ratio..log2....IAA..300.....PBS..0.")],
  1,
  function(ratios) trapz(x = times, y = ratios)
)
all.data.unfiltered$AUC.diff.unaltered <-
  all.data.unfiltered$AUC.PBS.unaltered - all.data.unfiltered$AUC.IAA.unaltered

## Second: Same Starting Point (SSP). The IAA 0 timepoint is moved to 0 (to
## match PBS) by subtracting it from the later timepoints before integrating.
all.data.unfiltered$AUC.IAA.SSP <- apply(
  all.data.unfiltered[, c("Abundance.Ratio..log2....IAA..0.....PBS..0.",
                          "Abundance.Ratio..log2....IAA..9.....PBS..0.",
                          "Abundance.Ratio..log2....IAA..30.....PBS..0.",
                          "Abundance.Ratio..log2....IAA..300.....PBS..0.")],
  1,
  function(ratios) {
    iaa.start <- ratios[1]
    shifted <- c(0, ratios[-1] - iaa.start)
    trapz(x = times, y = shifted)
  }
)
all.data.unfiltered$AUC.diff.SSP <-
  all.data.unfiltered$AUC.PBS.unaltered - all.data.unfiltered$AUC.IAA.SSP


# Bring the new AUC columns onto merged.data (join on all shared columns).
merged.data %<>% left_join(all.data.unfiltered) %>% suppressMessages()

# fwrite(merged.data, file = "081121_significant_timecourse_data_merged.csv")
# fwrite(all.data.unfiltered, file = "081121_all_data_with_AUC.csv")

# Hook profiles ----
# Plot the peptides of TGGT1_289100, using the placeholder cluster labels 1 and 2.
hook.data <- subset(merged.data, Master.Protein.Accessions == "TGGT1_289100")
hook.fakeclusters <- c(1L, 2L)
grobtosave <- plot_2clusters_with_heatmap(
  hook.data,
  title = "Hook Peptides",
  cluster.method = 'projection_based',
  V = hook.fakeclusters
)
# test.data <- merged.data %>% subset(Master.Protein.Accessions == "TGGT1_246740")
# test.data <- rbind(test.data, test.data)
# hook.fakeclusters <- c(1, 2) %>% as.integer()
# grobtosave <- plot_2clusters_with_heatmap(test.data, 
#                                           title = "Hook Peptides",
#                                           cluster.method = 'projection_based', V = hook.fakeclusters)
#
# ggsave("062921_hook.pdf", grobtosave, scale = 2.2)

# GSEA formatting ----
# merged.data <- read.csv("significant_timecourse_data.csv", check.names = T, stringsAsFactors = F, )

# Per-protein counts: number of peptides in the merged (significant) set and
# number of peptides overall.
gsea.data <- all.data.unfiltered
gsea.data$Unique.ID <- get.unique.ids2(gsea.data)
all.data.unfiltered$Unique.ID <- get.unique.ids2(all.data.unfiltered)
merged.data$Unique.ID <- get.unique.ids2(merged.data)

# 1 when the peptide is in merged.data, 0 otherwise. A single vectorised %in%
# replaces the per-peptide lookup.
# (Assumes the companion's `%!in%` is simply the negation of `%in%`.)
gsea.data$expr.merged <- as.numeric(gsea.data$Unique.ID %in% merged.data$Unique.ID)
gsea.data$number <- 1
merged.data$number <- 1

gsea.summary <- gsea.data %>% group_by(Master.Protein.Accessions) %>%
  summarise(expr_in_merged = sum(expr.merged == 1))
all.data.unfiltered$number <- 1
all.summary <- all.data.unfiltered %>% group_by(Master.Protein.Accessions) %>%
  summarise(expr_in_all = sum(number == 1))
# Explicit key: the accession is the only column the two summaries share.
gsea.summary %<>% left_join(all.summary, by = "Master.Protein.Accessions")

gsea.summary$Description <- 'na'
gsea.summary <- select(gsea.summary, "NAME" = "Master.Protein.Accessions",
                       "Description",
                       "expr_in_merged",
                       "expr_in_all")
# fwrite(gsea.summary, file = "significant_timecourse_data_merged-forGSEA.csv")
rm(gsea.summary, gsea.data)

# GSEA Enrichment analysis ----
integral.pm <- fread('PM.-.INTEGRAL.tsv', check.names = TRUE)
# First ortholog Gene.ID whose Input.Ortholog.s. entry matches the symbol.
# A symbol with no match now yields NA instead of stopping the script with
# "subscript out of bounds".
integral.pm$ME49.ID <- vapply(integral.pm$SYMBOL, function(symbol) {
  id.list <- ortholog.conversion$Gene.ID[grep(symbol, ortholog.conversion$Input.Ortholog.s.)]
  if (length(id.list) == 0) {
    NA_character_
  } else {
    as.character(id.list[[1]])
  }
}, character(1), USE.NAMES = FALSE)

# Annotate with the Phyre2 predictions (via the ME49 id) and with the
# annotation / phenotype table (via the original symbol), then save.
integral.pm <- left_join(integral.pm, phyre2, by = c("ME49.ID" = 'ID')) %>%
  left_join(annotation.phenotype, by = c("SYMBOL" = 'ID'))
fwrite(integral.pm, "Enriched-Integral-PM.csv")
# Chi squared test for localization / category dependence ----
## Localizations: All LOPIT known localizations
## Categories: All phosphorylated, CDPK1-dependent
## The code above should be run first so that helper columns such as
## `number` and the unique ids exist.
# Peptide counts per predicted localisation: all phosphopeptides, and the
# CDPK1-dependent subset.
LOPIT.summary <- left_join(
  all.data.unfiltered %>%
    group_by(localisation.prediction) %>%
    summarise(all.phos = sum(number==1)),
  merged.data %>%
    group_by(localisation.prediction) %>%
    summarise(cdpk1.dep = sum(number==1))
)

# Localisations with no CDPK1-dependent peptides count as zero.
LOPIT.summary$cdpk1.dep <- ifelse(is.na(LOPIT.summary$cdpk1.dep),
                                  0,
                                  LOPIT.summary$cdpk1.dep)

library("graphics")

# Drop the last row, then use the first column as row names to get a
# contingency table.
LOPIT.summary <- head(LOPIT.summary, -1)
LOPIT.summary2 <- data.frame(LOPIT.summary, row.names = 1)
test <- as.table(as.matrix(LOPIT.summary2))

mosaicplot(test, shade = TRUE, las=2,
           main = "")
library("gplots")
balloonplot(t(test), main ="", xlab ="", ylab="",
            label = FALSE, show.margins = FALSE, dotsize = 10)

library("vcd")
assoc(test, shade = TRUE, las=3)
chisq.test(LOPIT.summary2)
#sad ok pie chart time
# Pie charts (one facet per category) of the fraction of rows with
# number == 1 that fall into each localisation.prediction.

all.data.unfiltered$category <- 'all.phospho'
merged.data$category <- 'cdpk1.dep'

# Counts for all phosphopeptides; the last row is dropped positionally.
LOPIT.summary.pies1 <- all.data.unfiltered %>%
  group_by(localisation.prediction, category) %>%
  summarise(count = sum(number == 1)) %>%
  head(-1)
LOPIT.summary.pies1$count[is.na(LOPIT.summary.pies1$count)] <- 0

# Counts for the CDPK1-dependent set; the first row is relabelled and then
# removed, so only the remaining localisations are joined below.
LOPIT.summary.pies2 <- merged.data %>%
  group_by(localisation.prediction, category) %>%
  summarise(count = sum(number == 1))
LOPIT.summary.pies2$localisation.prediction %<>% as.character()
LOPIT.summary.pies2$localisation.prediction[1] <- 'alsounknown'
LOPIT.summary.pies2 %<>% tail(-1)

# Align the CDPK1-dependent counts to the localisations of the full set so
# both facets share the same categories; absent localisations count as 0.
temp <- left_join(LOPIT.summary.pies1, LOPIT.summary.pies2, by = c("localisation.prediction"))
temp$count.y[is.na(temp$count.y)] <- 0
temp$category.y <- "cdpk1.dep"
temp <- temp %>% select(localisation.prediction, 'category' = category.y, 'count' = count.y)

# Convert counts to fractions of each set's total. The total is computed
# once per table instead of once per element.
LOPIT.summary.pies1$count <- LOPIT.summary.pies1$count / sum(LOPIT.summary.pies1$count)
temp$count <- temp$count / sum(temp$count)

LOPIT.summary.pies <- rbind(LOPIT.summary.pies1, temp)

ggplot(data = LOPIT.summary.pies,
       aes(x = " ", y = count, group = localisation.prediction,
           colour = localisation.prediction, fill = localisation.prediction)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  facet_grid(. ~ category) + aliceTheme

rm(LOPIT.summary, LOPIT.summary.pies, LOPIT.summary.pies1, LOPIT.summary.pies2, LOPIT.summary2, temp, test)

# Kolmogorov test (phenotype score) ----
## Tests whether the phenotype scores of all phosphorylated data and of the
## CDPK1-dependent data come from different distributions.
## Caveat: the scores are discrete, so the continuous KS test may not be
## fully appropriate here.

all.phospho.phenotypes <- all.data.unfiltered$phenotype
cdpk1.dep.phenotypes <- merged.data$phenotype
stats::ks.test(cdpk1.dep.phenotypes, all.phospho.phenotypes)
all.phenotyped.genes <- select(annotation.phenotype, phenotype)

## All phosphorylated vs. every phenotyped gene.
## disc_ks_test is the better fit because phenotype scores are technically
## a discrete distribution; it compares against the ecdf of the reference.
library(KSgeneral)

stats::ks.test(all.phospho.phenotypes, all.phenotyped.genes[["phenotype"]])
disc_ks_test(all.phospho.phenotypes, ecdf(all.phenotyped.genes[["phenotype"]]))
## CDPK1-dependent vs. every phenotyped gene.
stats::ks.test(cdpk1.dep.phenotypes, all.phenotyped.genes[["phenotype"]])
disc_ks_test(cdpk1.dep.phenotypes, ecdf(all.phenotyped.genes[["phenotype"]]))


## Visualising the distributions: overlaid density curves for the three
## phenotype-score sets (all phosphorylated, CDPK1-dependent, all phenotyped),
## each with a dashed vertical line at its mean (NAs ignored).
## The two bare vectors are turned into data frames through the pipe, and
## aes(x=.) / mean(.) refer to that single column.
## NOTE(review): this relies on the piped column being named `.` -- confirm
## before restructuring. rdbuDiv and aliceTheme are defined outside this section.
ggplot(all.phospho.phenotypes %>% as.data.frame, aes(x=.)) +
  # Density curves; the colour aesthetic is a constant label so that each
  # curve gets a legend entry resolved by scale_colour_manual below.
  geom_density(size = 1, aes(color = 'All Phosphorylated'), fill = rdbuDiv[4], alpha = .4) +
  geom_density(data = cdpk1.dep.phenotypes %>% as.data.frame,
               size = 1, aes(color = 'CDPK1-dep'), fill = rdbuDiv[6], alpha = .4) +
  geom_density(data = all.phenotyped.genes,
               size = 1, aes(x = phenotype, color = 'All Phenotyped'), fill = rdbuDiv[5], alpha = .4) +
  # Dashed mean markers, coloured to match the corresponding curve outline.
  geom_vline(data = cdpk1.dep.phenotypes %>% as.data.frame, aes(xintercept=mean(., na.rm = T)), color=rdbuDiv[9],
             linetype="dashed") +
  geom_vline(data = all.phospho.phenotypes %>% as.data.frame, aes(xintercept=mean(., na.rm = T)), color=rdbuDiv[1],
             linetype="dashed") +
  geom_vline(data = all.phenotyped.genes, aes(xintercept=mean(phenotype, na.rm = T)), color='#000000',
             linetype="dashed") +
  labs(title = "Phenotype Score Distributions", 
       x = "Phenotype Score", y ="Density") +
  # Map the constant colour labels used above to actual line colours.
  scale_colour_manual(name="Line Color",
                      values=c('All Phosphorylated'=rdbuDiv[1], 'CDPK1-dep'=rdbuDiv[9],
                               'All Phenotyped' = '#000000')) +
  aliceTheme

# Chi squared test for conservation / category dependence ----
## Conservations: 
## Categories: All phosphorylated, CDPK1-dependent
## Should also run the GSA code just to add all necessary variables like uniqueID
# Build a two-column table: per conservation `type`, the number of rows with
# number == 1 in all.data.unfiltered (all.phos) and in merged.data
# (cdpk1.dep). The LOPIT.* variable names are reused from the localisation
# analysis.
LOPIT.summary <- all.data.unfiltered %>%
  group_by(type) %>%
  summarise(all.phos = sum(number == 1)) %>%
  left_join(merged.data %>%
              group_by(type) %>%
              summarise(cdpk1.dep = sum(number == 1)),
            by = "type")

# Types missing from merged.data come back as NA from the join; treat them
# as a count of zero (vectorised, type-stable).
LOPIT.summary$cdpk1.dep[is.na(LOPIT.summary$cdpk1.dep)] <- 0

library("graphics")

# Drop the last row before building the contingency table.
# NOTE(review): presumably this is the group without a conservation type
# (sorted last by group_by) -- confirm, since the removal is positional.
LOPIT.summary <- head(LOPIT.summary, -1)
# First column (the type) becomes the row names.
LOPIT.summary2 <- data.frame(LOPIT.summary, row.names = 1)
test <- as.table(as.matrix(LOPIT.summary2))

mosaicplot(test, shade = TRUE, las = 2,
           main = "")
library("gplots")
balloonplot(t(test), main = "", xlab = "", ylab = "",
            label = FALSE, show.margins = FALSE, dotsize = 10)

library("vcd")
assoc(test, shade = TRUE, las = 3)
chisq.test(LOPIT.summary2)
#sad ok pie chart time
# Pie charts (one facet per category) of the fraction of rows with
# number == 1 that fall into each conservation `type`.

all.data.unfiltered$category <- 'all.phospho'
merged.data$category <- 'cdpk1.dep'

# Counts for all phosphopeptides; the last row is dropped positionally.
LOPIT.summary.pies1 <- all.data.unfiltered %>%
  group_by(type, category) %>%
  summarise(count = sum(number == 1)) %>%
  head(-1)
LOPIT.summary.pies1$count[is.na(LOPIT.summary.pies1$count)] <- 0

# Counts for the CDPK1-dependent set; the first row is relabelled and then
# removed, so only the remaining types are joined below.
LOPIT.summary.pies2 <- merged.data %>%
  group_by(type, category) %>%
  summarise(count = sum(number == 1))
LOPIT.summary.pies2$type %<>% as.character()
LOPIT.summary.pies2$type[1] <- 'alsounknown'
LOPIT.summary.pies2 %<>% tail(-1)

# Align the CDPK1-dependent counts to the types of the full set so both
# facets share the same categories; absent types count as 0.
temp <- left_join(LOPIT.summary.pies1, LOPIT.summary.pies2, by = c("type"))
temp$count.y[is.na(temp$count.y)] <- 0
temp$category.y <- "cdpk1.dep"
temp <- temp %>% select(type, 'category' = category.y, 'count' = count.y)

# Convert counts to fractions of each set's total. The total is computed
# once per table instead of once per element.
LOPIT.summary.pies1$count <- LOPIT.summary.pies1$count / sum(LOPIT.summary.pies1$count)
temp$count <- temp$count / sum(temp$count)

LOPIT.summary.pies <- rbind(LOPIT.summary.pies1, temp)

ggplot(data = LOPIT.summary.pies,
       aes(x = " ", y = count, group = type, colour = type, fill = type)) +
  geom_bar(width = 1, stat = "identity") +
  coord_polar("y", start = 0) +
  facet_grid(. ~ category) + aliceTheme

rm(LOPIT.summary, LOPIT.summary.pies, LOPIT.summary.pies1, LOPIT.summary.pies2, LOPIT.summary2, temp, test)

# ***Zaprinast-dependent analysis (DMSO data)----
# Read the DMSO peptide-group table, strip the "-t26_1-p1" suffix from the
# accessions, and attach phenotype, Phyre2 and LOPIT annotations.
all.data <- read.csv('20210607_CDPK1TimeCourse_TMTPro1_PHOSPHO_DMSO_b_PeptideGroupsAll.csv',
                     fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)
all.data$Master.Protein.Accessions <- stri_extract_first(all.data$Master.Protein.Accessions,
                                                         regex = '.*(?=-t26_1-p1)')
all.data <- suppressMessages(
  left_join(all.data, annotation.phenotype, by = c("Master.Protein.Accessions" = 'ID'))
)

# First Gene.ID whose ortholog field matches the accession, as a string.
all.data$ME49.ID <- as.character(lapply(all.data$Master.Protein.Accessions, function(accession) {
  orthologs <- as.list(ortholog.conversion$Gene.ID[grep(accession, ortholog.conversion$Input.Ortholog.s.)])
  as.character(orthologs[1])
}))
all.data <- left_join(all.data, phyre2, by = c("ME49.ID" = 'ID'))

all.data <- left_join(all.data, lopit, by = c("Master.Protein.Accessions" = 'Gene_ID'))

# Keep only accessions starting with TGGT1; the printed number is the
# (negative) change in row count.
initialcount0 <- nrow(all.data)
all.data <- subset(all.data, grepl("^TGGT1", Master.Protein.Accessions))
print(paste0("Removing any human peptides: ", (nrow(all.data) - initialcount0)))
# Snapshot reused as the starting point of the later replicate analyses.
initial.data <- all.data

# *First replicate analysis ----
# Removing any peptide that was not found at every timepoint ----
initialcount1 <- nrow(all.data)
all.data <- all.data %>% subset(Quan.Info != "No Quan Values")

# Keep only rows with a non-missing abundance in all eight Sample 1 channels
# (PBS and IAA at 0, 9, 30 and 300).
all.data <- all.data[complete.cases(all.data[, c("Abundance..F3..127N..Sample..1..PBS..0",
                                                 "Abundance..F3..129N..Sample..1..PBS..9",
                                                 "Abundance..F3..131N..Sample..1..PBS..30",
                                                 "Abundance..F3..133N..Sample..1..PBS..300",
                                                 "Abundance..F3..126..Sample..1..IAA..0",
                                                 "Abundance..F3..128C..Sample..1..IAA..9",
                                                 "Abundance..F3..130C..Sample..1..IAA..30",
                                                 "Abundance..F3..132C..Sample..1..IAA..300")]), ]
print(paste0("Removing any peptide that was not found at every timepoint: ", (nrow(all.data) - initialcount1)))

# Removing any non phosphopeptides ----
# Flag rows whose Modifications mention "Phospho" in one vectorised step
# (also safe when no rows are left), then keep only the flagged rows.
initialcount2 <- nrow(all.data)
all.data$Phosphorylated <- ifelse(grepl("Phospho", all.data$Modifications), "Yes", "No")
all.data %<>% subset(Phosphorylated == "Yes")

print(paste0("Removing any non-phosphopeptides: ", (nrow(all.data) - initialcount2)))
# Snapshots of the phosphopeptide set before the fold-change / AUC filters.
all.data.unfiltered <- all.data
all.data.unfiltered.R1 <- all.data

# Removing peptides without a high L2FC between PBS 0 and PBS30 ----
# Keep rows whose PBS 30 / PBS 0 log2 ratio exceeds 0.5 in magnitude.
initialcount6 <- nrow(all.data)
all.data <- subset(all.data, abs(Abundance.Ratio..log2....PBS..30.....PBS..0.) > .5)
print(paste0("Removing peptides without a high L2FC between PBS 0 and PBS30: ", (nrow(all.data) - initialcount6)))

# Removing peptides without a large AUC difference between PBS and IAA profile ----
initialcount5 <- nrow(all.data)

# The PBS 0 / PBS 0 reference point is set to 0 for every row so the PBS
# profile has four points like the IAA profile.
all.data.unfiltered$Abundance.Ratio..log2....PBS..0.....PBS..0. <- 0
# The four timepoints are treated as equally spaced positions 1..4.
times <- c(1, 2, 3, 4)
# Trapezoidal area under each row's replicate-1 PBS profile.
all.data.unfiltered$AUC.PBS <- apply(
  all.data.unfiltered[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
# Trapezoidal area under each row's replicate-1 IAA profile.
all.data.unfiltered$AUC.IAA <- apply(
  all.data.unfiltered[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.1.....PBS..0.1.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
all.data.unfiltered$AUC.diff <- with(all.data.unfiltered, AUC.PBS - AUC.IAA)

## Selection method: null distribution = DMSO for AUC diff
# NOTE(review): this reads the same file that is loaded into all.data at the
# top of this section -- confirm that using it as its own null is intended.
dmso.data <- read.csv('20210607_CDPK1TimeCourse_TMTPro1_PHOSPHO_DMSO_b_PeptideGroupsAll.csv', fill = T, stringsAsFactors = F, check.names = T)
# Filter on dmso.data's own accessions. Testing all.data's column here would
# apply a mask of a different length, taken from an already filtered table.
dmso.data %<>% subset(grepl("^TGGT1", Master.Protein.Accessions))

dmso.data <- dmso.data %>% subset(Quan.Info != "No Quan Values")
# Keep only rows with a non-missing abundance in all eight Sample 1 channels.
dmso.data <- dmso.data[complete.cases(dmso.data[, c("Abundance..F3..127N..Sample..1..PBS..0",
                                                    "Abundance..F3..129N..Sample..1..PBS..9",
                                                    "Abundance..F3..131N..Sample..1..PBS..30",
                                                    "Abundance..F3..133N..Sample..1..PBS..300",
                                                    "Abundance..F3..126..Sample..1..IAA..0",
                                                    "Abundance..F3..128C..Sample..1..IAA..9",
                                                    "Abundance..F3..130C..Sample..1..IAA..30",
                                                    "Abundance..F3..132C..Sample..1..IAA..300")]), ]
# Keep only rows whose Modifications mention "Phospho" (vectorised flag).
dmso.data$Phosphorylated <- ifelse(grepl("Phospho", dmso.data$Modifications), "Yes", "No")
dmso.data %<>% subset(Phosphorylated == "Yes")

# Same AUC computation as for the sample data: the PBS 0 reference point is
# 0, the timepoints are treated as equally spaced positions 1..4.
dmso.data$Abundance.Ratio..log2....PBS..0.....PBS..0. <- 0
times <- c(1,2,3,4)
dmso.data$AUC.PBS <- apply(
  dmso.data[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
dmso.data$AUC.IAA <- apply(
  dmso.data[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.1.....PBS..0.1.",
                "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.1.....PBS..0.1.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
dmso.data$AUC.diff <- dmso.data$AUC.PBS - dmso.data$AUC.IAA

## P-value generation
# Standardise each AUC difference against the DMSO null: subtract the null
# mean, then divide by the null standard deviation. The parentheses matter:
# the division must apply to the centred value, not to the mean alone.
# Missing null values are ignored so that a single NA does not turn every
# z-score into NA.
all.data.unfiltered$zscore.AUC.ZAP <-
  (all.data.unfiltered$AUC.diff - mean(dmso.data$AUC.diff, na.rm = TRUE)) /
  sd(dmso.data$AUC.diff, na.rm = TRUE)
# Two-sided p-value under a standard normal.
all.data.unfiltered$pvalue.AUC.ZAP <- 2*pnorm(abs(all.data.unfiltered$zscore.AUC.ZAP), lower.tail = FALSE)
par(mfrow=c(1, 2))
hist(all.data.unfiltered$pvalue.AUC.ZAP, main = 'P-value distribution', xlab = "P-value")
hist(all.data.unfiltered$zscore.AUC.ZAP, main = "Z-score distribution", xlab = "Z-score")

# Bring the AUC / z-score / p-value columns onto the fold-change-filtered
# rows (natural join on all shared columns).
all.data %<>% left_join(all.data.unfiltered) %>% suppressMessages()

# Order by AUC difference and keep the rows significant at 0.05.
all.data.ranked <- all.data[order(all.data$AUC.diff),]
all.data.ranked %<>% subset(pvalue.AUC.ZAP < .05)

print(paste0("Removing peptides without a large AUC difference between PBS and IAA profile: ", (nrow(all.data.ranked) - initialcount5)))

# Saving significant data ----
# Keep replicate-1 results for the merged-replicate analysis.
r1.data.all <- all.data.unfiltered
r1.data <- all.data.ranked
# fwrite(all.data.ranked, file = "081121_significant_timecourse_data_r1_dmso.csv")

# Clustering ----

hist(all.data.ranked$pvalue.AUC.ZAP %>% trim())

# Matrix of the eight replicate-1 profile columns (IAA then PBS) used as
# clustering input. The columns are selected by name without the stray
# empty first argument.
clustering.data <- all.data.ranked %>%
  dplyr::select(c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.1.....PBS..0.1.",
                  "Abundance.Ratio..log2....PBS..0.....PBS..0.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.1.....PBS..0.1.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.1.....PBS..0.1."))
clustering.data <- data.matrix(clustering.data)
library(setRNG)
# Fixed RNG state so the projection and clustering below are repeatable.
setRNG(kind="Wichmann-Hill", seed=c(08,22,2000), normal.kind="Box-Muller")
projectionpoints <- NeRV(clustering.data)
visualization <- GeneralizedUmatrix(Data = clustering.data,
                                    projectionpoints, PlotIt = TRUE)
LC <- c(visualization$Lines, visualization$Columns)
# Three clusters on the generalized U-matrix of the projected points.
cluster.vector <- ProjectionBasedClustering(k = 3, Data = clustering.data,
                                            BestMatches = visualization$Bestmatches,
                                            LC = LC, PlotIt = TRUE)
## Callouts
# Label rows of TGGT1_289100 with "- Hook"; every other row gets a blank.
all.data.ranked$Callout.Name <- ifelse(all.data.ranked$Master.Protein.Accessions == "TGGT1_289100",
                                       "- Hook", " ")

grobtosave <- plot_clusters_with_heatmap(all.data.ranked, 
                                         title = "Projection Based Clustering of CDPK1-dependent phosphopeptides (R1, DMSO)",
                                         cluster.method = 'projection_based', V = cluster.vector)
# ggsave("071421_Clusters_R1_dmso.pdf", plot = grobtosave, scale = 2.2)


# *Second replicate analysis ----
# Removing any peptide that was not found at every timepoint ----
# Restart from the annotated snapshot taken before the replicate-1 filters.
initialcount1 <- nrow(initial.data)
all.data <- initial.data %>% subset(Quan.Info != "No Quan Values")
# Keep only rows with a non-missing abundance in all eight Sample 2 channels
# (PBS and IAA at 0, 9, 30 and 300).
all.data <- all.data[complete.cases(all.data[, c("Abundance..F3..128N..Sample..2..PBS..0",
                                                 "Abundance..F3..130N..Sample..2..PBS..9",
                                                 "Abundance..F3..132N..Sample..2..PBS..30",
                                                 "Abundance..F3..134N..Sample..2..PBS..300",
                                                 "Abundance..F3..127C..Sample..2..IAA..0",
                                                 "Abundance..F3..129C..Sample..2..IAA..9",
                                                 "Abundance..F3..131C..Sample..2..IAA..30",
                                                 "Abundance..F3..133C..Sample..2..IAA..300")]), ]
print(paste0("Removing any peptide that was not found at every timepoint: ", (nrow(all.data) - initialcount1)))

# Removing any non phosphopeptides ----
# Flag rows whose Modifications mention "Phospho" in one vectorised step
# (also safe when no rows are left), then keep only the flagged rows.
initialcount2 <- nrow(all.data)
all.data$Phosphorylated <- ifelse(grepl("Phospho", all.data$Modifications), "Yes", "No")
all.data %<>% subset(Phosphorylated == "Yes")

print(paste0("Removing any non-phosphopeptides: ", (nrow(all.data) - initialcount2)))
# Snapshots of the phosphopeptide set before the fold-change / AUC filters.
all.data.unfiltered <- all.data
all.data.unfiltered.R2 <- all.data

# Removing peptides without a high L2FC between PBS 0 and PBS30 ----
# Keep rows whose PBS 30 / PBS 0 log2 ratio exceeds 0.5 in magnitude.
initialcount6 <- nrow(all.data)
all.data <- subset(all.data, abs(Abundance.Ratio..log2....PBS..30.....PBS..0.) > .5)
print(paste0("Removing peptides without a high L2FC between PBS 0 and PBS30: ", (nrow(all.data) - initialcount6)))

# Removing peptides without a large AUC difference between PBS and IAA profile ----
initialcount5 <- nrow(all.data)

# The PBS 0 / PBS 0 reference point is set to 0 for every row so the PBS
# profile has four points like the IAA profile.
all.data.unfiltered$Abundance.Ratio..log2....PBS..0.....PBS..0. <- 0
# The four timepoints are treated as equally spaced positions 1..4.
times <- c(1, 2, 3, 4)
# Trapezoidal area under each row's replicate-2 PBS profile.
all.data.unfiltered$AUC.PBS <- apply(
  all.data.unfiltered[, c("Abundance.Ratio..log2....PBS..0.....PBS..0.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
# Trapezoidal area under each row's replicate-2 IAA profile.
all.data.unfiltered$AUC.IAA <- apply(
  all.data.unfiltered[, c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                          "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.")],
  1,
  function(profile) trapz(x = times, y = profile)
)
all.data.unfiltered$AUC.diff <- with(all.data.unfiltered, AUC.PBS - AUC.IAA)

## Selection method: null distribution = DMSO for AUC diff
## P-value generation
# Standardise each AUC difference against the DMSO null: subtract the null
# mean, then divide by the null standard deviation. The parentheses matter:
# the division must apply to the centred value, not to the mean alone.
# Missing null values are ignored so that a single NA does not turn every
# z-score into NA.
# NOTE(review): dmso.data$AUC.diff is built from the replicate-1 ratio
# columns -- confirm that reusing it as the null for replicate 2 is intended.
all.data.unfiltered$zscore.AUC.ZAP <-
  (all.data.unfiltered$AUC.diff - mean(dmso.data$AUC.diff, na.rm = TRUE)) /
  sd(dmso.data$AUC.diff, na.rm = TRUE)
# Two-sided p-value under a standard normal.
all.data.unfiltered$pvalue.AUC.ZAP <- 2*pnorm(abs(all.data.unfiltered$zscore.AUC.ZAP), lower.tail = FALSE)
par(mfrow=c(1, 2))
hist(all.data.unfiltered$pvalue.AUC.ZAP, main = 'P-value distribution', xlab = "P-value")
hist(all.data.unfiltered$zscore.AUC.ZAP, main = "Z-score distribution", xlab = "Z-score")

# Bring the AUC / z-score / p-value columns onto the fold-change-filtered
# rows (natural join on all shared columns).
all.data %<>% left_join(all.data.unfiltered) %>% suppressMessages()

# Order by AUC difference and keep the rows significant at 0.05.
all.data.ranked <- all.data[order(all.data$AUC.diff),]
all.data.ranked %<>% subset(pvalue.AUC.ZAP < .05)

print(paste0("Removing peptides without a large AUC difference between PBS and IAA profile: ", (nrow(all.data.ranked) - initialcount5)))

# Saving significant data ----
# fwrite(all.data.ranked, file = "081121_significant_timecourse_data_r2_dmso.csv")
# Keep replicate-2 results for the merged-replicate analysis.
r2.data <- all.data.ranked
r2.data.all <- all.data.unfiltered
# Clustering ----

hist(all.data.ranked$pvalue.AUC.ZAP %>% trim())

# Matrix of the eight replicate-2 profile columns (IAA then PBS) used as
# clustering input. The columns are selected by name without the stray
# empty first argument.
clustering.data <- all.data.ranked %>%
  dplyr::select(c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.",
                  "Abundance.Ratio..log2....PBS..0.....PBS..0.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2."))
clustering.data <- data.matrix(clustering.data)
library(setRNG)
# Fixed RNG state so the projection and clustering below are repeatable.
setRNG(kind="Wichmann-Hill", seed=c(08,22,2000), normal.kind="Box-Muller")
projectionpoints <- NeRV(clustering.data)
visualization <- GeneralizedUmatrix(Data = clustering.data,
                                    projectionpoints, PlotIt = TRUE)
LC <- c(visualization$Lines, visualization$Columns)
# Three clusters on the generalized U-matrix of the projected points.
cluster.vector <- ProjectionBasedClustering(k = 3, Data = clustering.data,
                                            BestMatches = visualization$Bestmatches,
                                            LC = LC, PlotIt = TRUE)
## Callouts
# Label rows of TGGT1_289100 with "- Hook"; every other row gets a blank.
all.data.ranked$Callout.Name <- ifelse(all.data.ranked$Master.Protein.Accessions == "TGGT1_289100",
                                       "- Hook", " ")

grobtosave <- plot_clusters_with_heatmap(all.data.ranked, 
                                         title = "Projection Based Clustering of CDPK1-dependent phosphopeptides (R2, DMSO)",
                                         cluster.method = 'projection_based', V = cluster.vector)
# ggsave("071421_Clusters_R2_dmso.pdf", plot = grobtosave, scale = 2.2)

# *Merged replicates analysis ----

# Clear the per-replicate working variables; the r1.* / r2.* results and
# initial.data are kept.
rm(all.data, all.data.ranked, all.data.unfiltered, clustering.data, dmso.data, grobtosave, 
   lineplot.legend, projectionpoints, visualization, cluster.vector, i, initialcount0,
   initialcount1, initialcount2, initialcount5, initialcount6, LC, times)

# Removing any peptide that was not found at every timepoint ----
initialcount1 <- nrow(initial.data)
all.data <- initial.data %>% subset(Quan.Info != "No Quan Values")

# Keep only rows with a non-missing abundance in all sixteen channels
# (Sample 1 and Sample 2, PBS and IAA, at 0, 9, 30 and 300).
all.data <- all.data[complete.cases(all.data[, c("Abundance..F3..127N..Sample..1..PBS..0",
                                                 "Abundance..F3..128N..Sample..2..PBS..0",
                                                 "Abundance..F3..129N..Sample..1..PBS..9",
                                                 "Abundance..F3..130N..Sample..2..PBS..9",
                                                 "Abundance..F3..131N..Sample..1..PBS..30",
                                                 "Abundance..F3..132N..Sample..2..PBS..30",
                                                 "Abundance..F3..133N..Sample..1..PBS..300",
                                                 "Abundance..F3..134N..Sample..2..PBS..300",
                                                 "Abundance..F3..126..Sample..1..IAA..0",
                                                 "Abundance..F3..127C..Sample..2..IAA..0",
                                                 "Abundance..F3..128C..Sample..1..IAA..9",
                                                 "Abundance..F3..129C..Sample..2..IAA..9",
                                                 "Abundance..F3..130C..Sample..1..IAA..30",
                                                 "Abundance..F3..131C..Sample..2..IAA..30",
                                                 "Abundance..F3..132C..Sample..1..IAA..300",
                                                 "Abundance..F3..133C..Sample..2..IAA..300")]), ]
print(paste0("Removing any peptide that was not found at every timepoint: ", (nrow(all.data) - initialcount1)))

# Removing any non phosphopeptides ----
# Flag rows whose Modifications mention "Phospho" in one vectorised step
# (also safe when no rows are left), then keep only the flagged rows.
initialcount2 <- nrow(all.data)
all.data$Phosphorylated <- ifelse(grepl("Phospho", all.data$Modifications), "Yes", "No")
all.data %<>% subset(Phosphorylated == "Yes")

print(paste0("Removing any non-phosphopeptides: ", (nrow(all.data) - initialcount2)))
all.data.unfiltered <- all.data

# Merging ----

# Per-peptide AUCs of both replicates side by side. The second join is keyed
# on the columns of all.data.unfiltered (the annotated input columns plus
# Phosphorylated), so the replicate-specific columns (AUC.*, z-score,
# p-value) receive .x (R1) / .y (R2) suffixes. Deriving the key from the
# data replaces a hard-coded column count that silently breaks when the
# input table gains or loses a column.
r1r2.data.all <- left_join(all.data.unfiltered, r1.data.all) %>% suppressMessages() %>%
  left_join(r2.data.all, by = colnames(all.data.unfiltered)) %>% suppressMessages()
r1r2.data.all$Description <- 'na'
r1r2.data.all %<>% select("NAME" = "Master.Protein.Accessions",
                          "Description",
                          "AUC_PBS_R1"= "AUC.PBS.x",
                          "AUC_PBS_R2"= "AUC.PBS.y",
                          "AUC_IAA_R1"= "AUC.IAA.x",
                          "AUC_IAA_R2"= "AUC.IAA.y")

# Peptides significant in both replicates: join on every column except the
# five replicate-specific statistics, which end up with .x / .y suffixes.
merged.data <- inner_join(r1.data, r2.data,
                          by = setdiff(colnames(r1.data),
                                       c("AUC.PBS", "AUC.IAA", "AUC.diff",
                                         "zscore.AUC.ZAP", "pvalue.AUC.ZAP")))

# fwrite(all.data.unfiltered, "Processed_DMSO_phosphopeptides.csv")
fwrite(r1r2.data.all, file = "Processed_DMSO_phosphopeptides-AUCs.csv")

# Clustering ----

# NOTE(review): merged.data comes from an inner join that does not key on
# pvalue.AUC.ZAP, so the column presumably exists only as pvalue.AUC.ZAP.x /
# pvalue.AUC.ZAP.y here -- confirm which replicate this histogram should show.
hist(merged.data$pvalue.AUC.ZAP %>% trim())

# Matrix of the eight replicate-2 profile columns (IAA then PBS) used as
# clustering input for the merged set. The columns are selected by name
# without the stray empty first argument.
clustering.data <- merged.data %>%
  dplyr::select(c("Abundance.Ratios..log2...by.Bio..Rep.....IAA..0.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....IAA..300.2.....PBS..0.2.",
                  "Abundance.Ratio..log2....PBS..0.....PBS..0.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..9.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..30.2.....PBS..0.2.",
                  "Abundance.Ratios..log2...by.Bio..Rep.....PBS..300.2.....PBS..0.2."))
clustering.data <- data.matrix(clustering.data)
library(setRNG)
# Fixed RNG state so the projection and clustering below are repeatable.
setRNG(kind="Wichmann-Hill", seed=c(08,22,2000), normal.kind="Box-Muller")
projectionpoints <- NeRV(clustering.data)
visualization <- GeneralizedUmatrix(Data = clustering.data,
                                    projectionpoints, PlotIt = TRUE)
LC <- c(visualization$Lines, visualization$Columns)
# Three clusters on the generalized U-matrix of the projected points.
cluster.vector <- ProjectionBasedClustering(k = 3, Data = clustering.data,
                                            BestMatches = visualization$Bestmatches,
                                            LC = LC, PlotIt = TRUE)
## Callouts
# Label rows of TGGT1_289100 with "- Hook"; every other row gets a blank.
merged.data$Callout.Name <- ifelse(merged.data$Master.Protein.Accessions == "TGGT1_289100",
                                   "- Hook", " ")

grobtosave <- plot_clusters_with_heatmap(merged.data, 
                                         title = "Projection Based Clustering of CDPK1-dependent phosphopeptides (merged, DMSO)",
                                         cluster.method = 'projection_based', V = cluster.vector)
# ggsave("071421_Clusters_merged_dmso.pdf", plot = grobtosave, scale = 2.2)

# fwrite(merged.data, file = "significant_timecourse_data_merged_dmso.csv")

# Hook profiles ----
# Plot the merged-set rows of TGGT1_289100 on their own, passing a fixed
# two-element integer vector as the cluster assignment.
hook.data <- subset(merged.data, Master.Protein.Accessions == "TGGT1_289100")
hook.fakeclusters <- c(1L, 2L)
grobtosave <- plot_2clusters_with_heatmap(
  hook.data,
  title = "Hook Peptides",
  cluster.method = 'projection_based',
  V = hook.fakeclusters
)
# ggsave("062921_hook.pdf", grobtosave, scale = 2.2)

# AUC difference plots ----
# Attach the shared peptide key to every table used in the replicate comparison.
r1.data$unique.id <- get.unique.ids2(r1.data)
r2.data$unique.id <- get.unique.ids2(r2.data)
merged.data$unique.id <- get.unique.ids2(merged.data)
r1.data.all$unique.id <- get.unique.ids2(r1.data.all)
r2.data.all$unique.id <- get.unique.ids2(r2.data.all)

# For each replicate's table, look up the same peptide's AUC difference in the
# other replicate's full table and store it as AUC.diff.otherRep.
r1.data <- left_join(
  r1.data,
  select(r2.data.all, AUC.diff.otherRep = AUC.diff, unique.id),
  by = 'unique.id'
)
r2.data <- left_join(
  r2.data,
  select(r1.data.all, AUC.diff.otherRep = AUC.diff, unique.id),
  by = 'unique.id'
)
# Scatter of AUC difference in replicate 1 (y axis) against replicate 2 (x axis).
# Rows of r1.data carry their own value in AUC.diff and the replicate-2 value in
# AUC.diff.otherRep; rows of r2.data are the other way round, hence the swapped
# x/y mapping in the second layer.
ggplot(r1.data,
       aes(x = AUC.diff.otherRep,
           y = AUC.diff, color = 'Significant in R1')) +
  geom_point(size = 2, shape = 16) +
  # Colour is mapped only (no constant `color =` on the layer): a constant would
  # override the mapping and drop 'Significant in R2' from the colour scale and
  # legend. The manual scale below still draws these points in cbPalette[3].
  geom_point(data = r2.data, aes(x = AUC.diff,
                                 y = AUC.diff.otherRep, color = 'Significant in R2'),
             size = 2, shape = 16) +
  ylab("AUC Difference (R1)") +
  xlab("AUC Difference (R2)") +
  ggtitle("AUC Difference across replicates") +
  aliceTheme +
  scale_color_manual(values = c('Significant in R1' = cbPalette[2], 
                                'Significant in R2' = cbPalette[3])) +
  theme(aspect.ratio = 1) 
# Saves the most recently created plot, i.e. the one built above.
ggsave("080921_AUC_difference_Replicate_comparison_scatter_DMSOdata.pdf", scale = 1)

# Upset plots ----
# Build the membership table for the UpSet plot: one row per peptide key found
# in either unfiltered replicate, with a 0/1 flag per set.
all.data.unfiltered.R1$unique.id <- get.unique.ids2(all.data.unfiltered.R1)
all.data.unfiltered.R2$unique.id <- get.unique.ids2(all.data.unfiltered.R2)
all.data.unfiltered.for.upset <- full_join(
  all.data.unfiltered.R1 %>% select(unique.id),
  all.data.unfiltered.R2 %>% select(unique.id),
  by = "unique.id"
)

# Vectorised membership tests. `%in%` never yields NA, and as.numeric() keeps
# the flags as numeric 0/1. Unlike a `1:nrow()` loop, this also works when the
# joined table has zero rows.
all.data.unfiltered.for.upset$in.r1.all <- as.numeric(
  all.data.unfiltered.for.upset$unique.id %in% all.data.unfiltered.R1$unique.id
)
all.data.unfiltered.for.upset$in.r2.all <- as.numeric(
  all.data.unfiltered.for.upset$unique.id %in% all.data.unfiltered.R2$unique.id
)
all.data.unfiltered.for.upset$in.r1.sig <- as.numeric(
  all.data.unfiltered.for.upset$unique.id %in% r1.data$unique.id
)
all.data.unfiltered.for.upset$in.r2.sig <- as.numeric(
  all.data.unfiltered.for.upset$unique.id %in% r2.data$unique.id
)
# Convert the membership table to a data.table in place, keeping the row names
# in a 'Unique_Peptide' column.
suppressMessages(
  setDT(all.data.unfiltered.for.upset, keep.rownames = 'Unique_Peptide')[]
)

# Draw the UpSet plot of the four membership flags into a PDF and add a title.
pdf(file = "upset_DMSO_dataset.pdf")
upset(
  all.data.unfiltered.for.upset,
  nsets = 4,
  sets.x.label = "Exists in:",
  mainbar.y.label = "# peptides"
)
grid.text(
  "Peptides in ... in DMSO dataset",
  x = 0.65,
  y = 0.95,
  gp = gpar(fontsize = 20)
)
dev.off()

# Keep the underlying membership table next to the figure.
fwrite(all.data.unfiltered.for.upset, file = "upset_data_DMSO_dataset.csv")

# Creating all toxo info df: GT1's ----

source('C:/Users/Nicole/Desktop/Spring 21 UROP/071421_PhosphoproteomicsPipeline_companion.R')

## Load the per-gene phenotype / LOPIT localisation table and the ToxoDB text
## search summary, then attach the former to the latter by gene ID.

lopits <- read.csv(
  '2022_05_17_GT1_ToxoInfo_annotaiton_phenotype_lopit_EBmicIP.csv',
  fill = TRUE,
  stringsAsFactors = FALSE,
  check.names = TRUE
)
lopits <- select(lopits, 'Gene_ID', 'phenotype', 'localisation.prediction')

toxodb.annotations.phenotypes.names.go <- read.csv(
  '517-GenesByText_Summary.csv',
  fill = TRUE,
  stringsAsFactors = FALSE,
  check.names = TRUE
)

all.info.gt1 <- left_join(
  toxodb.annotations.phenotypes.names.go,
  lopits,
  by = c('Gene.ID' = 'Gene_ID')
)

## Rebuild the annotation and conservation tables from disk before joining.
## Joining `conservations` onto an `annotation.phenotype` that already carries
## a `type` column (from an earlier join) would add suffixed duplicates, and the
## `type.x` selection below would then depend on that leftover state.
annotation.phenotype <- read.csv('GT1_Annotation_Phenotype.csv', fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE)
conservations <- read.csv('GT1_ConservationCategories.csv', fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE) %>% select(GT1.ID, type)
## Keep only the first alphanumeric run that follows a literal dot.
conservations$type %<>% stri_extract_first(regex = '(?<=\\.)[[:alnum:]]*')
annotation.phenotype <- left_join(annotation.phenotype, conservations, by = c("ID" = 'GT1.ID')) %>% suppressMessages()

## Map each GT1 ID to the first ortholog row whose 'Input.Ortholog.s.' field
## matches it (regex match via grep). IDs with no match end up as the string
## "NULL", the same convention used when ME49.ID is built for all.data.
conservations$ME49.ID <- lapply(conservations$GT1.ID, function(x) {
  id.list <- ortholog.conversion$Gene.ID[grep(x, ortholog.conversion$Input.Ortholog.s.)] %>% as.list()
  return (id.list[1] %>% as.character())}) %>% as.character()

## Joining datasets together
## `type` exists on both sides of the first join, so it comes out as `type.x`
## (from conservations) and `type.y` (from annotation.phenotype).
conservations %<>% left_join(annotation.phenotype, by = c("GT1.ID" = "ID")) %>%
  left_join(phyre2, by = c("ME49.ID" = 'ID')) %>%
  left_join(lopit, by = c("GT1.ID" = 'Gene_ID'))

## Selecting only relevant columns and saving
conservations %<>% select(
  "GT1.ID",
  "ME49.ID",
  "Conservation" = "type.x",
  "Phenotype.Score" = "phenotype",
  "ToxoDB.Annotation" = "ToxoDB_Annotation",
  "Size",
  "Phyre2.Hit" = "Hit",
  "Phyre2.Confidence" = "Confidence",
  "Phyre2.Sequence.ID" = "Sequence_id",
  "Phyre2.Coverage.Percent" = "Coverage_percent",
  "Phyre2.Hit.Info.1" = "Hit_info_1",
  "Phyre2.Hit.Info.2" = "Hit_info_2",
  "Phyre2.Hit.Info.3" = "Hit_info_3",
  "LOPIT.Localization.Prediction" = "localisation.prediction"
)
# fwrite(conservations, file = "081121_All_toxo_info.csv")

# Creating all toxo info df: ME49's ----

source('C:/Users/Nicole/Desktop/Spring 21 UROP/071421_PhosphoproteomicsPipeline_companion.R')

## Reload every lookup table from disk: phenotype / ToxoDB annotation,
## conservation categories, Phyre2 top hits, ME49-to-GT1 ortholog pairs and
## LOPIT localisation predictions.
annotation.phenotype <- read.csv(
  'GT1_Annotation_Phenotype.csv',
  fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE
)

conservations <- select(
  read.csv(
    'GT1_ConservationCategories.csv',
    fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE
  ),
  GT1.ID, type
)
## Keep only the first alphanumeric run that follows a literal dot.
conservations$type <- stri_extract_first(
  conservations$type,
  regex = '(?<=\\.)[[:alnum:]]*'
)
annotation.phenotype <- suppressMessages(
  left_join(annotation.phenotype, conservations, by = c("ID" = 'GT1.ID'))
)

phyre2 <- read.csv('Toxo_PhyrePredictions_TopHit.csv', stringsAsFactors = FALSE)

ortholog.conversion <- as.data.frame(
  select(
    read.csv(
      '2019_11_11_GenesByOrthologs_Summary_ME49_to_TGGT1.csv',
      fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE, as.is = TRUE
    ),
    'Gene.ID', 'Input.Ortholog.s.'
  )
)

lopit <- select(
  read.csv(
    'LOPIT_raw_all.csv',
    fill = TRUE, stringsAsFactors = FALSE, check.names = TRUE, as.is = TRUE
  ),
  'Gene_ID', 'Description', 'localisation.prediction'
)
## For each GT1 ID take the first ortholog row whose 'Input.Ortholog.s.' field
## matches it (regex match via grep). An ID with no match becomes the string
## "NULL", because the first element of an empty list is NULL.
conservations$ME49.ID <- as.character(
  lapply(conservations$GT1.ID, function(gt1.id) {
    hits <- grep(gt1.id, ortholog.conversion$Input.Ortholog.s.)
    first.hit <- as.list(ortholog.conversion$Gene.ID[hits])[1]
    as.character(first.hit)
  })
)

## Joining datasets together
conservations <- conservations %>%
  left_join(annotation.phenotype, by = c("GT1.ID" = "ID")) %>%
  left_join(phyre2, by = c("ME49.ID" = 'ID')) %>%
  left_join(lopit, by = c("GT1.ID" = 'Gene_ID'))

## Selecting only relevant columns and saving
conservations <- select(
  conservations,
  "GT1.ID",
  "ME49.ID",
  "Conservation" = "type.x",
  "Phenotype.Score" = "phenotype",
  "ToxoDB.Annotation" = "ToxoDB_Annotation",
  "Size",
  "Phyre2.Hit" = "Hit",
  "Phyre2.Confidence" = "Confidence",
  "Phyre2.Sequence.ID" = "Sequence_id",
  "Phyre2.Coverage.Percent" = "Coverage_percent",
  "Phyre2.Hit.Info.1" = "Hit_info_1",
  "Phyre2.Hit.Info.2" = "Hit_info_2",
  "Phyre2.Hit.Info.3" = "Hit_info_3",
  "LOPIT.Localization.Prediction" = "localisation.prediction"
)
# fwrite(conservations, file = "081121_All_toxo_info.csv")






# Figuring out metrics for unique ID ----
# Read the peptide-group export with the original column headers left
# untouched (check.names = FALSE), so names containing spaces stay as written.
all.data.tester <- read.csv(
  '20210607_CDPK1TimeCourse_TMTPro1_PHOSPHO_ZAP_b_PeptideGroupsAll.csv',
  fill = TRUE,
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# Build one identifier per row by concatenating, with no separator, the
# 'Modifications', 'Sequence' and 'Modifications in Master Proteins' columns.
#
# df.to.add.unique.ids.to: a data frame that has those three columns (the third
#   name contains spaces, so the table must be read with check.names = FALSE).
# Returns: an unnamed character vector with one element per row; zero rows give
#   character(0). Missing values are pasted as the text "NA".
# Stops with an error naming the columns if any of the three is absent.
get.unique.ids.intrinsic <- function(df.to.add.unique.ids.to) {
  id.columns <- c('Modifications',
                  'Sequence',
                  'Modifications in Master Proteins')
  absent <- setdiff(id.columns, names(df.to.add.unique.ids.to))
  if (length(absent) > 0) {
    stop("missing column(s): ", paste(absent, collapse = ", "), call. = FALSE)
  }
  # paste0 is vectorised over whole columns, so no row-wise apply() (and no
  # coercion of the data frame to a matrix) is needed.
  paste0(df.to.add.unique.ids.to[['Modifications']],
         df.to.add.unique.ids.to[['Sequence']],
         df.to.add.unique.ids.to[['Modifications in Master Proteins']])
}
# Store the concatenated per-row identifier as a new 'unid' column.
all.data.tester[["unid"]] <- get.unique.ids.intrinsic(all.data.tester)










